Load the Libraries

pacman::p_load(psych, xray , ggplot2, texreg, DT, wrapr, dplyr,
               sjmisc, sjlabelled, sjstats, sjPlot, tidyr,ggpubr,corrplot,
               knitr, kableExtra, captioner, car, excelR,ggcorrplot,Rmisc,sjmisc,funModeling,
               forcats, hrbrthemes,tidyverse,finalfit,patchwork,GGally,broom,caret,matrixTests,PerformanceAnalytics, pROC, rattle, finalfit)

Connect the Dataset

df = read.csv("Well_being_Health_Behavior_Environmental_Factors_Dataset.csv")

First Glance of the Data set

Head View of the Data Set

pacman::p_load("DT")
head(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))

Tail View of the Dataset

pacman::p_load("DT")
tail(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))

Head Tail View of the DataSet

pacman::p_load("DT","psych")
headTail(x = df,top = 4,bottom = 4) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T,scrollY=T))

Scrutinizing the Data

3S of the Dataset

Size of the Dataset

cat("The Dataset (df) contains ",nrow(df)," records and", ncol(df)," Columns or variables.")
## The Dataset (df) contains  395697  records and 33  Columns or variables.

Structure of the Data Set

str(df)
## 'data.frame':    395697 obs. of  33 variables:
##  $ X                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age                   : int  30 64 49 43 68 38 23 61 78 82 ...
##  $ wellbeing             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SocialSupport         : int  5 4 3 5 4 4 5 3 5 3 ...
##  $ race                  : int  0 1 1 1 1 1 1 1 1 0 ...
##  $ income                : int  5 5 1 5 2 5 5 NA NA NA ...
##  $ GeneralHealth         : int  4 4 5 2 5 4 1 3 3 3 ...
##  $ PoorPhysicalHealthDays: int  0 5 NA 0 30 0 0 0 4 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 0 3 0 0 0 0 0 ...
##  $ Asthma                : int  0 1 1 0 1 0 0 0 0 0 ...
##  $ HealthInsurance       : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ CVD                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LimitedActivity       : int  0 1 1 0 1 1 0 0 0 0 ...
##  $ Diabetes              : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ employ                : int  1 7 8 1 8 1 1 7 7 5 ...
##  $ BMI                   : int  3 3 3 2 1 2 2 3 2 1 ...
##  $ HeavyDrinker          : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CurrentSmoker         : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PoorSleepDays         : int  1 30 0 0 14 30 0 2 0 2 ...
##  $ PhysicalActivity      : int  1 0 0 1 0 0 1 1 1 0 ...
##  $ marital               : int  1 1 0 1 1 1 1 1 1 0 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...




Interpretation from the Structure of the Dataset :



  1. The Dataset originally has only numerical and integer values

  2. Our Target variable “wellbeing” is also an integer datatype. However, we expect that into a category as we conceive it as classification thing of Wellbeing in the form of category as (yes or no) or (good or bad).
    2.Action : Convert “Wellbeing” into a factor

  3. It is also evident by looking at the structure of the dataset that several other variables like (Asthma, HealthInsurance, CVD, Diabetes, HeavyDrinker) to name a few are supposed to be categories or factors but infact they are integers.
    3.Action : Convert those set of Categorical Variables as Factors from Integers.

  4. Some set of variables are of the perfect type for example - Age is an integer and is supposed to be one. PM2.5 which represents the Fine Particle Matter in micrograms/cubic meter is actually a numerical data in a continous form.

  5. Some Variables such as BMI and income were expected to be continuous. However, they are ordinal values mentioned in integers which is a point of concern

5. Action : Convert those variables into probable factors whenver needed in Analytics

Summary of the DataSet

summary(df)
##        X               age          wellbeing     SocialSupport  
##  Min.   :     1   Min.   : 7.00   Min.   :0.000   Min.   :1.000  
##  1st Qu.: 98925   1st Qu.:45.00   1st Qu.:0.000   1st Qu.:4.000  
##  Median :197849   Median :57.00   Median :0.000   Median :5.000  
##  Mean   :197849   Mean   :56.32   Mean   :0.055   Mean   :4.181  
##  3rd Qu.:296773   3rd Qu.:69.00   3rd Qu.:0.000   3rd Qu.:5.000  
##  Max.   :395697   Max.   :99.00   Max.   :1.000   Max.   :5.000  
##                                   NA's   :17025   NA's   :20998  
##       race           income      GeneralHealth   PoorPhysicalHealthDays
##  Min.   :0.000   Min.   :1.00    Min.   :1.000   Min.   : 0.000        
##  1st Qu.:1.000   1st Qu.:2.00    1st Qu.:2.000   1st Qu.: 0.000        
##  Median :1.000   Median :4.00    Median :3.000   Median : 0.000        
##  Mean   :0.806   Mean   :3.61    Mean   :2.586   Mean   : 4.472        
##  3rd Qu.:1.000   3rd Qu.:5.00    3rd Qu.:3.000   3rd Qu.: 3.000        
##  Max.   :1.000   Max.   :5.00    Max.   :5.000   Max.   :30.000        
##  NA's   :5309    NA's   :54657   NA's   :1555    NA's   :9341          
##  PoorMentalHealthDays     Asthma       HealthInsurance       CVD       
##  Min.   : 0.000       Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.: 0.000       1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.000  
##  Median : 0.000       Median :0.0000   Median :1.0000   Median :0.000  
##  Mean   : 3.497       Mean   :0.0936   Mean   :0.8955   Mean   :0.067  
##  3rd Qu.: 2.000       3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.000  
##  Max.   :30.000       Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##  NA's   :7361         NA's   :2590     NA's   :991      NA's   :3928   
##  LimitedActivity     Diabetes          employ           BMI       
##  Min.   :0.0000   Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :0.0000   Median :0.0000   Median :3.000   Median :2.000  
##  Mean   :0.2723   Mean   :0.1274   Mean   :3.885   Mean   :1.933  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:7.000   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :8.000   Max.   :3.000  
##  NA's   :1858     NA's   :395      NA's   :1322    NA's   :17290  
##   HeavyDrinker   CurrentSmoker    PoorSleepDays    PhysicalActivity
##  Min.   :0.000   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000   Median : 3.000   Median :1.0000  
##  Mean   :0.046   Mean   :0.1583   Mean   : 7.707   Mean   :0.7303  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:12.000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :30.000   Max.   :1.0000  
##  NA's   :10742   NA's   :2463     NA's   :7560     NA's   :533     
##     marital       PrematureMortality HouseholdIncome    ParkAccess    
##  Min.   :0.0000   Min.   :133.5      Min.   : 22289   Min.   :  1.00  
##  1st Qu.:0.0000   1st Qu.:294.0      1st Qu.: 41103   1st Qu.: 13.00  
##  Median :1.0000   Median :345.0      Median : 47930   Median : 32.00  
##  Mean   :0.5552   Mean   :355.8      Mean   : 50177   Mean   : 34.58  
##  3rd Qu.:1.0000   3rd Qu.:405.7      3rd Qu.: 56166   3rd Qu.: 52.00  
##  Max.   :1.0000   Max.   :883.5      Max.   :119525   Max.   :100.00  
##  NA's   :1559                                         NA's   :9633    
##    CrimeRate      UnemploymentRate  WaterSafety      HighSchoolRate 
##  Min.   :  12.0   Min.   : 1.100   Min.   :  0.000   Min.   : 27.0  
##  1st Qu.: 202.0   1st Qu.: 7.000   1st Qu.:  0.000   1st Qu.: 74.0  
##  Median : 343.0   Median : 8.400   Median :  1.000   Median : 80.0  
##  Mean   : 405.6   Mean   : 8.716   Mean   :  6.511   Mean   : 79.1  
##  3rd Qu.: 558.0   3rd Qu.:10.200   3rd Qu.:  5.000   3rd Qu.: 86.0  
##  Max.   :2062.0   Max.   :29.700   Max.   :100.000   Max.   :100.0  
##  NA's   :4174                      NA's   :5039      NA's   :16     
##  SomeCollegeRate AccessToRecFacility FastFoodPercentage     PM2.5      
##  Min.   :23.20   Min.   : 0.00       Min.   :  8.00     Min.   : 6.50  
##  1st Qu.:54.00   1st Qu.: 6.90       1st Qu.: 44.00     1st Qu.:10.86  
##  Median :61.20   Median : 9.80       Median : 50.00     Median :11.74  
##  Mean   :60.45   Mean   :10.13       Mean   : 48.33     Mean   :11.63  
##  3rd Qu.:67.70   3rd Qu.:13.00       3rd Qu.: 54.00     3rd Qu.:12.78  
##  Max.   :90.40   Max.   :57.50       Max.   :100.00     Max.   :14.78  
##                                      NA's   :42                        
##  PopulationDensity
##  Min.   :    1.7  
##  1st Qu.:   66.3  
##  Median :  276.9  
##  Mean   : 1186.5  
##  3rd Qu.:  991.3  
##  Max.   :69468.4  
## 




Interpretation from the Structure of the Dataset :



  1. The Summary provides a good support of evidence for those points of concerns expressed earlier in the structure of the dataset. It describes the standard statistics for those variables which are supposed to be categorical variables either ordinal or nominal

  2. The Summary indicates something interesting that many records and most variables in the dataset contains null values.

  3. Action : Either Remove or Impute Null Values ensuring that in each of the case does not statistically significantly alters the dataset.

  4. The Good part that the summary iterates is that the almost all variables are within their supposed range. For an instance, no age record is negative, No PM2.5 less than 0, Household income min is not negative and so on. This is a sigh of relief.

Below are some boxplots for concerning continuous and discrete variables demonstrating their ranges with outliers

Boxplots For Range Check and Outlier Detection

Age Boxplot

This boxplot for age seems very healthy. In the original BRFSS dataset, ages 7 and 9 were designated for mising values.

Premature Mortality Boxplot

boxplot(df["PrematureMortality"],main = "Premature Mortality Box Plot", xlab = "Premature Mortality",horizontal = TRUE)

The distribution of the Premature Mortality have some outliers in the right ends shows that it is skewed to the right.

Checking Distribution of Premature Mortality

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PrematureMortality)) + geom_density()+
 ggtitle ("Premature Mortality Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

From the above test, we could possibly reject the null hypothesis that the distribution of premature Mortality is normal. However through Central Limit Theorem I shall prove the normality.

s30 <- c()
s50 <- c()
s500 <- c()
n =396000
for ( i in 1:n){
s30[i] = mean(sample(df$PrematureMortality,30, replace = TRUE))
s50[i] = mean(sample(df$PrematureMortality,50, replace = TRUE))
s500[i] = mean(sample(df$PrematureMortality,500, replace = TRUE))
}
par(mfrow=c(1,3))
hist(s30, col ="lightblue",main="Sample size=30",xlab ="Premature Mortality")
abline(v = mean(s30), col = "red")

hist(s50, col ="lightgreen", main="Sample size=50",xlab ="Premature Mortality")
abline(v = mean(s50), col = "red")

hist(s500, col ="orange",main="Sample size=500",xlab ="Premature Mortality")
abline(v = mean(s500), col = "red")

Interpretation

We could clearly see a bell curve with Central Limit Theorem and prove that with sufficiently large sample size any distribution that does not seem to be normally distributed gets normally distributed using Central Limit Theorem or Bootstrapping.

Household Income Boxplot

boxplot(df["HouseholdIncome"],main = "Household Income Box Plot", xlab = "Household Income",horizontal = TRUE)

The distribution of the Household Income have some outliers in the right ends shows that it is skewed to the right.

Checking Distribution of Household Income

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HouseholdIncome)) + geom_density()+
 ggtitle ("Household Income Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.


So from now on we shall not Check with the CLT or Bootstrapping for the normal distribution of any variable rather assume that the distribution will be normal whenever viewed from the lens of CLT or Bootstrapping.

Unemployment Rate Boxplot

boxplot(df["UnemploymentRate"],main = "Unemployment Rate   Box Plot", xlab = "Unemployment Rate  ",horizontal = TRUE)

The distribution of the Unemployment Rate have some outliers on both ends more probably on the right which signifies that it is skewed more to the right.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=UnemploymentRate)) + geom_density()+
 ggtitle ("Unemployment Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

High School Rate Boxplot

boxplot(df["HighSchoolRate"],main = "High School Rate Box Plot", xlab = "High School Rate",horizontal = TRUE)

The distribution of the High School Rate have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HighSchoolRate)) + geom_density()+
 ggtitle ("High School Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 
## Warning: Removed 16 rows containing non-finite values (`stat_density()`).

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Some College Rate Boxplot

boxplot(df["SomeCollegeRate"],main = "Some College Rate Box Plot", xlab = "Some College Rate  ",horizontal = TRUE)

The distribution of the Some College Rate have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=SomeCollegeRate)) + geom_density()+
 ggtitle ("Some College Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Access To Recreational Facility Boxplot

boxplot(df["AccessToRecFacility"],main = "Access To Recreational Facility Box Plot", xlab = "Access To Recreational Facility  ",horizontal = TRUE)

The distribution of the Access To Recreational Facility have some outliers on right ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=AccessToRecFacility)) + geom_density()+
 ggtitle ("Access To Recreational Facility Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Fast Food Percentage Boxplot

boxplot(df["FastFoodPercentage"],main = "Fast Food Percentage Box Plot", xlab = "Fast Food Percentage  ",horizontal = TRUE)

The distribution of the Fast Food Percentage have some outliers on both ends

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=FastFoodPercentage)) + geom_density()+
 ggtitle ("Fast Food Percentage Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 
## Warning: Removed 42 rows containing non-finite values (`stat_density()`).

The distribution seems normal with multiple peaks and higher kurtosis

PM2.5 Boxplot

boxplot(df["PM2.5"],main = "PM2.5 Box Plot", xlab = "PM2.5  ",horizontal = TRUE)

The distribution of the PM2.5 have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PM2.5)) + geom_density()+
 ggtitle ("PM2.5 Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Population Density Boxplot

boxplot(df["PopulationDensity"],main = "Population Density Box Plot", xlab = "Population Density  ",horizontal = TRUE)

The distribution of the Population Density have some outliers on right ends which signifies that it is skewed more to the right

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PopulationDensity)) + geom_density()+
 ggtitle ("Population Density Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Interpretation of Boxplots for every variable



As far as continuous variables are concerned their distribution is quite questionable however, with Central Limit Theorem it could be deduced that they are representative of their population.
Also the boxplot help us indicates that the variables are in their expected range and have acceptable set of outliers and shall help us validate the min, max, median and range of those variables.

Inspect the presence and purview of null values

recoding missing values for age

# remove missing age values  
df <- df %>% 
  mutate(age = na_if(age, 7)) %>%
  mutate(age = na_if(age, 9))

Checking the count of null values by Variable

apply(apply(df,2,is.na),2,sum)
##                      X                    age              wellbeing 
##                      0                   3364                  17025 
##          SocialSupport                   race                 income 
##                  20998                   5309                  54657 
##          GeneralHealth PoorPhysicalHealthDays   PoorMentalHealthDays 
##                   1555                   9341                   7361 
##                 Asthma        HealthInsurance                    CVD 
##                   2590                    991                   3928 
##        LimitedActivity               Diabetes                 employ 
##                   1858                    395                   1322 
##                    BMI           HeavyDrinker          CurrentSmoker 
##                  17290                  10742                   2463 
##          PoorSleepDays       PhysicalActivity                marital 
##                   7560                    533                   1559 
##     PrematureMortality        HouseholdIncome             ParkAccess 
##                      0                      0                   9633 
##              CrimeRate       UnemploymentRate            WaterSafety 
##                   4174                      0                   5039 
##         HighSchoolRate        SomeCollegeRate    AccessToRecFacility 
##                     16                      0                      0 
##     FastFoodPercentage                  PM2.5      PopulationDensity 
##                     42                      0                      0

Plotting the null values by variable

library(tidyr)
pacman::p_load(inspectdf)
df %>% inspect_na %>% show_plot

Plot only variables with null values

df[,c(2:21,24,25,27,28,31)] %>% inspect_na %>% show_plot +
  ggtitle("Prevalence of Missing Values in the Dataset") +
  ylab("% of variable that is NA") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))

Check the rows with null values

sum(apply(df,1,anyNA))
## [1] 120094
cat("There are around ",sum(apply(df,1,anyNA)), " null records in the dataset of in total", nrow(df),"records \n","In case we delete all of the", sum(apply(df,1,anyNA)),"records with missing values then we are actually losing around", 100*(sum(apply(df,1,anyNA))/nrow(df)),"% of the total dataset.") 
## There are around  120094  null records in the dataset of in total 395697 records 
##  In case we delete all of the 120094 records with missing values then we are actually losing around 30.34999 % of the total dataset.

Data Preparation

Address the Null Value Problem

Approach - I Deletion of All Null Values

df.omit <- na.omit(df)
pacman::p_load(psych,DT)
headTail(df.omit, 10, 10) %>% datatable( rownames = FALSE, filter="top", options = list(pageLength = 21, scrollX=T))
cat(nrow(df.omit),ncol(df.omit))
## 275603 33
#write.csv(df.omit,file="df.omit.csv")

Checking For Statistical Significance Difference Pre-Post Deletion of NULL Values

Welch T-Test for Numeric Variables

# Welch T-Test
# numeric variables names
nvar_clean <- colnames(df.omit)[c(2,22:33)]

# test difference in means
mean_tt_omit <- function(a) {
  col_t_welch(df[a], df.omit[a])
}

# create means summary table
ar_omit <- array(0, dim=c(13,17)) # create empty array 13 rows, 18 columns
ar_omit <- as.data.frame(ar_omit)     # convert to dataframe


for (i in c(2,22:33)){
  ar_omit[i,1:17] <- mean_tt_omit(i)  # add results of welch t-test
  rownames(ar_omit)[i] <- colnames(df.omit)[i]  # add variable name
}

# change column names
colnames(ar_omit)[c(1:2, 4:6, 12, 13:14)] <- c("original n", "omit NAs n", "original mean", "omit NAs mean", "mean_diff", "p-value", "lower-CI", "upper-CI")  

# print table
ar_omit[nvar_clean, c(1:2, 4:6, 12, 13:14)] %>%
  datatable(caption="Summary table of difference in means between original and cleaned dataset", rownames=TRUE, options=list(pageLength=13,scrollX=T)) %>%
  formatRound(c(3:5, 7:8), digits=2) %>%
  formatRound(6, digits=5)

Chi-squared test for categorical variables

fvar_clean <- c(3,5,10:14,17:18,20:21)
chi_test <- function(a){
  chisq.test(table(df[,a]), p=table(df.omit[,a])/nrow(df.omit))
}
# create summary table
ar <- array(0, dim=c(11,9)) # create empty array 14 rows, 9 columns
ar <- as.data.frame(ar)     # convert to dataframe
for (i in fvar_clean){
  ar[i,1:9] <- chi_test(i)  # add results of chi-square test
  rownames(ar)[i] <- colnames(df)[i]  # add variable name
}
colnames(ar)[c(1,3)] <- c("Chi-square", "p-value")  # change column names
# print table
ar[fvar_clean, c(1,3)] %>%
  datatable(caption="Summary table of Chi-squared test on factor variables", rownames=TRUE, options=list(pageLength=14,scrollX=T)) %>%
  formatRound(columns=c(1:2), digits=6)

Due to large Sample size everything starts to make sense which can be seen by the p-value. Even after it shows statistically significant difference in the dataset before and after deleting null values. We tend to ignore the statistical evidence and stick to the fact that we have quite enough sample size to represent the population as a whole.

Recoding and Factorizing Variables

str(df.omit)
## 'data.frame':    275603 obs. of  33 variables:
##  $ X                     : int  1 2 4 5 6 7 12 13 14 15 ...
##  $ age                   : int  30 64 43 68 38 23 44 64 75 70 ...
##  $ wellbeing             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SocialSupport         : int  5 4 5 4 4 5 4 3 1 5 ...
##  $ race                  : int  0 1 1 1 1 1 1 0 1 1 ...
##  $ income                : int  5 5 5 2 5 5 5 3 3 4 ...
##  $ GeneralHealth         : int  4 4 2 5 4 1 5 4 1 2 ...
##  $ PoorPhysicalHealthDays: int  0 5 0 30 0 0 21 3 0 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 3 0 0 14 2 0 0 ...
##  $ Asthma                : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ HealthInsurance       : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ CVD                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LimitedActivity       : int  0 1 0 1 1 0 1 0 0 0 ...
##  $ Diabetes              : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ employ                : int  1 7 1 8 1 1 7 7 7 7 ...
##  $ BMI                   : int  3 3 2 1 2 2 2 2 2 2 ...
##  $ HeavyDrinker          : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CurrentSmoker         : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ PoorSleepDays         : int  1 30 0 14 30 0 15 10 0 15 ...
##  $ PhysicalActivity      : int  1 0 1 0 0 1 0 1 1 1 ...
##  $ marital               : int  1 1 1 1 1 1 0 1 0 0 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
##  - attr(*, "na.action")= 'omit' Named int [1:120094] 3 8 9 10 11 22 26 27 29 32 ...
##   ..- attr(*, "names")= chr [1:120094] "3" "8" "9" "10" ...

Identifying the Variables Requires Factorization and Recoding

Factorizing

variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
  df.omit[,i] <- as.factor(df.omit[,i])
}

Checking the Status

str(df.omit)
## 'data.frame':    275603 obs. of  33 variables:
##  $ X                     : int  1 2 4 5 6 7 12 13 14 15 ...
##  $ age                   : int  30 64 43 68 38 23 44 64 75 70 ...
##  $ wellbeing             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SocialSupport         : Factor w/ 5 levels "1","2","3","4",..: 5 4 5 4 4 5 4 3 1 5 ...
##  $ race                  : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 1 2 2 ...
##  $ income                : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 2 5 5 5 3 3 4 ...
##  $ GeneralHealth         : Factor w/ 5 levels "1","2","3","4",..: 4 4 2 5 4 1 5 4 1 2 ...
##  $ PoorPhysicalHealthDays: int  0 5 0 30 0 0 21 3 0 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 3 0 0 14 2 0 0 ...
##  $ Asthma                : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 1 1 1 ...
##  $ HealthInsurance       : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 2 2 ...
##  $ CVD                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LimitedActivity       : Factor w/ 2 levels "0","1": 1 2 1 2 2 1 2 1 1 1 ...
##  $ Diabetes              : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ employ                : Factor w/ 8 levels "1","2","3","4",..: 1 7 1 8 1 1 7 7 7 7 ...
##  $ BMI                   : Factor w/ 3 levels "1","2","3": 3 3 2 1 2 2 2 2 2 2 ...
##  $ HeavyDrinker          : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
##  $ CurrentSmoker         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ PoorSleepDays         : int  1 30 0 14 30 0 15 10 0 15 ...
##  $ PhysicalActivity      : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 2 2 ...
##  $ marital               : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 2 1 1 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
##  - attr(*, "na.action")= 'omit' Named int [1:120094] 3 8 9 10 11 22 26 27 29 32 ...
##   ..- attr(*, "names")= chr [1:120094] "3" "8" "9" "10" ...

Recoding the Variables

df.omit <- within(df.omit, {
  wellbeing <- Recode(wellbeing, '"1" = "Poor_Wellbeing"; "0" = "Good_Wellbeing"', as.factor=TRUE, to.value="=", interval=":", 
  separator=";")
})
df.omit <- within(df.omit, {
  CurrentSmoker <- Recode(CurrentSmoker, '"1" = "Current Smoker"; "0" = "Not a Current Smoker"; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})
df.omit <- within(df.omit, {
  HeavyDrinker <- Recode(HeavyDrinker, '"1" = "Heavy Drinker"; "0" = "Not a Heavy Drinker"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  HealthInsurance <- Recode(HealthInsurance , '"1" = "With Health Coverage"; "0" = "No Health Coverage"', as.factor=TRUE, to.value="=", interval=":", 
  separator=";")
})
df.omit <- within(df.omit, {
  CVD              <- Recode(CVD, '"1" = "Cardiovascular Disease"; "0" = "No Cardiovascular Disease"; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})
df.omit <- within(df.omit, {
  Diabetes        <- Recode(Diabetes, '"1" = "Diabetic"; "0" = "Non Diabetic"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  LimitedActivity <- Recode(LimitedActivity , '"1" = "Limited"; "0" = "Not Limited"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  PhysicalActivity <- Recode(PhysicalActivity , '"1" = "Physically Active"; "0" = "Not Physically Active"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  employ <- Recode(employ , '"1" = "Waged Employement"; "2" = "Self-Employed";"3" = "Unemployed for more than 1 yr" ;"4" = "Unemployed for less than 1 yr";"5" = "Homemaker";"6" = "Student";"7" = "Retired";"8" = "Unable to work" ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  BMI <- Recode(BMI , '"1" = "Healthy weight"; "2" = "Overweight";"3" = "Obese" ;;;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit$income <- recode_factor(df.omit$income, 
                                       "1" = "less than 15000", 
                                       "2" = "[15000-25000)", 
                                       "3" = "[25000-35000)",
                                       "4" = "[35000-50000)", 
                                       "5" = "50000 or more")

df.omit$Asthma <- recode_factor(df.omit$Asthma, '0' = "No Asthma", '1' = "Asthma")

df.omit$SocialSupport <- recode_factor(df.omit$SocialSupport, '1' = "Never", '2' = " Rarely", '3' = " Sometimes", '4' = "Usually", '5' = "Always")

df.omit$race <- recode_factor(df.omit$race, '0' = "Other races", '1' = "White")

df.omit$GeneralHealth <- recode_factor(df.omit$GeneralHealth, '1' = "Excellent",
                                      '2' = "Very good",
                                      '3' = "Good",
                                      '4' = "Fair",
                                      '5' = "Poor")

df.omit$marital <- recode_factor(df.omit$marital, '0' = "Not married", '1' = "Married")

Saving the Furbished File

# write.csv(df.omit,"Final_Furbished.csv",row.names = FALSE)

Descriptive Analytics

Access the data from the furbished file

df_final = read.csv("Final_Furbished.csv")
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
  df_final[,i] <- as.factor(df_final[,i])
}
summary(df_final)
##        X               age                 wellbeing         SocialSupport   
##  Min.   :     1   Min.   :18.00   Good_Wellbeing:260883    Rarely   :  8630  
##  1st Qu.: 98517   1st Qu.:44.00   Poor_Wellbeing: 14720    Sometimes: 29706  
##  Median :196815   Median :56.00                           Always    :142109  
##  Mean   :197112   Mean   :55.55                           Never     : 12615  
##  3rd Qu.:295444   3rd Qu.:67.00                           Usually   : 82543  
##  Max.   :395697   Max.   :99.00                                              
##                                                                              
##           race                    income         GeneralHealth  
##  Other races: 49755   [15000-25000)  : 45957   Excellent:52300  
##  White      :225848   [25000-35000)  : 32151   Fair     :33709  
##                       [35000-50000)  : 41985   Good     :81136  
##                       50000 or more  :127154   Poor     :14478  
##                       less than 15000: 28356   Very good:93980  
##                                                                 
##                                                                 
##  PoorPhysicalHealthDays PoorMentalHealthDays       Asthma      
##  Min.   : 0.000         Min.   : 0.000       Asthma   : 25592  
##  1st Qu.: 0.000         1st Qu.: 0.000       No Asthma:250011  
##  Median : 0.000         Median : 0.000                         
##  Mean   : 4.202         Mean   : 3.437                         
##  3rd Qu.: 3.000         3rd Qu.: 2.000                         
##  Max.   :30.000         Max.   :30.000                         
##                                                                
##              HealthInsurance                          CVD        
##  No Health Coverage  : 27354   Cardiovascular Disease   : 17940  
##  With Health Coverage:248249   No Cardiovascular Disease:257663  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##     LimitedActivity           Diabetes     
##  Limited    : 72270   Diabetic    : 32752  
##  Not Limited:203333   Non Diabetic:242851  
##                                            
##                                            
##                                            
##                                            
##                                            
##                            employ                   BMI        
##  Waged Employement            :121285   Healthy weight: 94336  
##  Retired                      : 74523   Obese         : 79608  
##  Self-Employed                : 23118   Overweight    :101659  
##  Homemaker                    : 18449                          
##  Unable to work               : 17507                          
##  Unemployed for more than 1 yr:  8814                          
##  (Other)                      : 11907                          
##               HeavyDrinker                 CurrentSmoker    PoorSleepDays   
##  Heavy Drinker      : 14131   Current Smoker      : 44088   Min.   : 0.000  
##  Not a Heavy Drinker:261472   Not a Current Smoker:231515   1st Qu.: 0.000  
##                                                             Median : 3.000  
##                                                             Mean   : 7.772  
##                                                             3rd Qu.:12.000  
##                                                             Max.   :30.000  
##                                                                             
##               PhysicalActivity         marital       PrematureMortality
##  Not Physically Active: 68341   Married    :159034   Min.   :133.5     
##  Physically Active    :207262   Not married:116569   1st Qu.:290.6     
##                                                      Median :341.3     
##                                                      Mean   :350.7     
##                                                      3rd Qu.:400.2     
##                                                      Max.   :883.5     
##                                                                        
##  HouseholdIncome    ParkAccess       CrimeRate      UnemploymentRate
##  Min.   : 23751   Min.   :  1.00   Min.   :  12.0   Min.   : 1.100  
##  1st Qu.: 41796   1st Qu.: 13.00   1st Qu.: 200.0   1st Qu.: 6.900  
##  Median : 48622   Median : 33.00   Median : 343.0   Median : 8.400  
##  Mean   : 50813   Mean   : 34.76   Mean   : 397.7   Mean   : 8.625  
##  3rd Qu.: 56750   3rd Qu.: 53.00   3rd Qu.: 544.0   3rd Qu.:10.200  
##  Max.   :119525   Max.   :100.00   Max.   :1627.0   Max.   :29.700  
##                                                                     
##   WaterSafety      HighSchoolRate   SomeCollegeRate AccessToRecFacility
##  Min.   :  0.000   Min.   : 27.00   Min.   :24.20   Min.   : 0.00      
##  1st Qu.:  0.000   1st Qu.: 74.00   1st Qu.:54.70   1st Qu.: 7.20      
##  Median :  1.000   Median : 81.00   Median :61.90   Median : 9.90      
##  Mean   :  6.427   Mean   : 79.36   Mean   :61.13   Mean   :10.34      
##  3rd Qu.:  5.000   3rd Qu.: 86.00   3rd Qu.:68.30   3rd Qu.:13.00      
##  Max.   :100.000   Max.   :100.00   Max.   :90.40   Max.   :51.60      
##                                                                        
##  FastFoodPercentage     PM2.5       PopulationDensity
##  Min.   :  8.00     Min.   : 6.50   Min.   :    1.7  
##  1st Qu.: 44.00     1st Qu.:10.83   1st Qu.:   69.8  
##  Median : 50.00     Median :11.69   Median :  290.2  
##  Mean   : 48.22     Mean   :11.59   Mean   : 1033.7  
##  3rd Qu.: 54.00     3rd Qu.:12.78   3rd Qu.:  990.8  
##  Max.   :100.00     Max.   :14.78   Max.   :69468.4  
## 
psych::describe(df_final, skew=FALSE, ranges=TRUE, quant=NULL, IQR=FALSE) %>%
  datatable(rownames=TRUE,
            options=list(pageLength=10,scrollX=T)) %>%
  formatRound(c(3:4,8), 2)

Graph Styling

my_colors <- c("#25ADB4","#777777")
my_colors2 <- c("#67d1d3","#777777")
mycolors2 <- c("#434343","#44c5c8","#67d1d3","#8ddcde","#baeaea")
my_color_palette <- c("#FF5656", "#1A78E0")

Checking distribution of categorical variables

Wellbeing

df_final %>%
ggplot(aes(x = wellbeing, fill = wellbeing)) + geom_bar(aes(y = after_stat(count)/sum(after_stat(count)))) + labs(title = "Wellbeing Distribution") + xlab("Wellbeing") + ylab("Percentage") 

# add labels
df_final %>%
  ggplot(aes(x = wellbeing, fill = wellbeing)) +
  geom_bar() + 
  stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
             vjust = -1, geom = "text", position = "identity", color ="black", size=5) + 
  labs(title = "Wellbeing Distribution") + 
  ylim(0,300000) +
  xlab("Wellbeing") + 
  ylab("Number of Individuals") + 
  theme_bw() +
  theme(legend.position = "none") +
  scale_fill_manual(values=my_colors2)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

### Age

df_final$Age_group <- ifelse(df_final$age < 25, "Age 18 to 24",ifelse(df_final$age < 35, "Age 25 to 34",ifelse(df_final$age < 45, "Age 35 to 44", ifelse(df_final$age < 55, "Age 44 to 54",ifelse(df_final$age < 65, "Age 55 to 64", "Age 65 or older")))))
df_final %>%
ggplot(aes(x = reorder(Age_group, Age_group, length, decreasing = FALSE), fill = Age_group, angle = 45)) +
geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Age Group Distribution") + xlab("Age group") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

# add labels
df_final %>%
  ggplot(aes(x = reorder(Age_group, Age_group, length, decreasing = FALSE), fill = Age_group)) +
  geom_bar() + 
  stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
             vjust = -1, geom = "text", position = "identity", color ="black", size=5) + 
  ylim(0,90000) +
  labs(title = "Age Group Distribution") + 
  xlab("Age group") + 
  ylab("Number of Individuals") + 
  theme_bw() +
  theme(legend.position = "none")

### Social Support

df_final %>%
ggplot(aes(x = SocialSupport, fill = SocialSupport)) + geom_bar() + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Social Support Distribution") + xlab("Social Support") + ylab("Percentage")

### Income

df_final %>%
ggplot(aes(x = income, fill = income)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Income Distribution") + xlab("Income") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

### Employment Status

# employment status
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
df_final %>%
  ggplot(aes(x = reorder(employ, employ, length, decreasing = FALSE), fill = employ)) +
  geom_bar() + 
  stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
             vjust = -1, geom = "text", position = "identity", color ="black", size=5) + 
  ylim(0,150000) +
  labs(title = "Employment Status Distribution") + 
  xlab("Employment Status") + 
  ylab("Number of Individuals") + 
  theme_bw() +
  scale_x_discrete(labels = wrap_format(10)) +
  theme(legend.position = "none")

General Health

df_final %>%
ggplot(aes(x = GeneralHealth, fill = GeneralHealth)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "General Health Distribution") + xlab("General Health Status") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

### Asthma

df_final %>%
ggplot(aes(x = Asthma, fill = Asthma)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Asthma Distribution") + xlab("Asthma") + ylab("Percentage") 

### Race

df_final %>%
ggplot(aes(x = race, fill = race)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Race Distribution") + xlab("Race") + ylab("Percentage") 

race_bar <- df_final %>%
  ggplot(aes(x = race, fill = race)) + 
  geom_bar() +
  stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")), vjust = -1, geom = "text", position = "identity", color ="black", size=5) + 
  ylim(0,260000) + 
  labs(title = "Race Distribution") + 
  xlab("Race") + 
  ylab("Percentage") + 
  theme_bw() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette="Dark2")

race_bar

### Marital Status Distribution

# bar chart
df_final %>%
  ggplot(aes(x = marital, fill = marital)) + 
  geom_bar() +
  stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")), vjust = -1, geom = "text", position = "identity", color ="black", size=5) + 
  ylim(0,200000) + 
  labs(title = "Marital Status Distribution") + 
  xlab("Marital Status") + 
  ylab("Percentage") + 
  theme_bw() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set1")

Bivariate analysis

Age and Race

# age and race
age_race_plot <- df.omit %>% 
  ggdensity(  x = "age" ,
     add = "mean", rug = FALSE,
     color = "race", fill = "race", palette = "Dark2", alpha=0.25) +
  ggtitle("Density Graph of Age by Race") +
  theme_bw() +
  theme(legend.position="right")

age_race_plot

p_ra <- df_final %>%
  ggplot(aes(x = Age_group, fill = race)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") + 
  labs(title = "Age Group and Race Distribution") + 
  theme_bw() +
  theme(legend.position="right") +
  scale_fill_brewer(palette="Dark2")
p_ra

Wellbeing and Race

p3 <- df_final %>%
  ggplot(aes(x = race, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different races")
p3

table(df_final$wellbeing, df_final$race)
##                 
##                  Other races  White
##   Good_Wellbeing       46493 214390
##   Poor_Wellbeing        3262  11458
rx.p1 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2


There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Race Non white and Hispanic and Race White and Hispanic

Wellbeing and Social Support

p1a <- df_final %>%
 ggplot(aes(x = SocialSupport, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different social support level")
p1a

p1a2 <- df_final %>%
 ggplot(aes(x = SocialSupport, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") + 
  labs(title = "Wellbeing status on different social support level") +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(
    legend.position="bottom",
    axis.text.x = element_text(size=8, face="bold"),)
p1a2

#### Wellbeing, Social Support, and Race

tab_xtab(df_final$SocialSupport, df_final$wellbeing, show.cell.prc=TRUE)
SocialSupport wellbeing Total
Good_Wellbeing Poor_Wellbeing
Rarely 5361
1.9 %
3269
1.2 %
8630
3.1 %
Sometimes 24667
9 %
5039
1.8 %
29706
10.8 %
Always 140156
50.9 %
1953
0.7 %
142109
51.6 %
Never 10807
3.9 %
1808
0.7 %
12615
4.6 %
Usually 79892
29 %
2651
1 %
82543
30 %
Total 260883
94.7 %
14720
5.3 %
275603
100 %
χ2=33188.934 · df=4 · Cramer’s V=0.347 · p=0.000
p1a2 + facet_wrap(~race)

Wellbeing, Social Support, and Age

p1a2 + facet_wrap(~Age_group)

Wellbeing and Income

p2 <- df_final %>%
  ggplot(aes(x = income, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + theme(axis.text.x = element_text(angle = 30)) + labs(title = "Wellbeing status on different income status")
p2

#### Bin Median HouseholdIncome (county)

# bin mean household income to match individual income bins
df_final <- df_final %>%
  mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))

df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df.omit$HouseholdIncome.bin)
## NULL
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)  
HouseholdIncome.bin wellbeing Total
Good_Wellbeing Poor_Wellbeing
less than 25000 203
203
0.1 %
11
11
0.1 %
214
214
0.1 %
[25000,35000) 16170
16368
6.2 %
1122
924
7.6 %
17292
17292
6.3 %
[35000,50000) 123944
124464
47.5 %
7543
7023
51.2 %
131487
131487
47.7 %
50000 or more 120566
119848
46.2 %
6044
6762
41.1 %
126610
126610
45.9 %
Total 260883
260883
100 %
14720
14720
100 %
275603
275603
100 %
χ2=166.368 · df=3 · Cramer’s V=0.025 · p=0.000
# compare againt individual income
df_final %>% 
  group_by(HouseholdIncome.bin) %>%
  ggplot(aes(x = income, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  ggtitle("Wellbeing status by income level, grouped by county median household income") +
  facet_wrap(~HouseholdIncome.bin, ncol=4) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
  scale_fill_manual(values=my_colors2)

Wellbeing, Income, Age

# income and wellbeing
df_final %>% 
  ggplot(aes(x = income, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  ggtitle("Wellbeing status on different income levels, grouped by race") +
  facet_wrap(~race) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
  theme(legend.position="bottom") +
  scale_fill_manual(values=my_colors2)

Wellbeing and Asthma

p4 <- df_final %>%
  ggplot(aes(x = Asthma, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status between people having or not Asthma")
p4

Wellbeing and Age

p5 <- df_final %>%
  ggplot(aes(x = Age_group, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different age group") + theme(axis.text.x = element_text(angle = 0)) + theme_bw() + scale_fill_manual(values=my_colors2)
p5

Wellbeing and General Health

p6 <- df_final %>%
  ggplot(aes(x = GeneralHealth, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") + 
  labs(title = "Wellbeing status on different general health status") +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(
    legend.position="bottom",
    axis.text.x = element_text(size=12, face="bold"),)
p6 

#### Wellbeing, General Health, and Race

tab_xtab(df_final$GeneralHealth, df_final$wellbeing, show.cell.prc=TRUE)
GeneralHealth wellbeing Total
Good_Wellbeing Poor_Wellbeing
Excellent 51525
18.7 %
775
0.3 %
52300
19 %
Fair 29680
10.8 %
4029
1.5 %
33709
12.3 %
Good 77287
28 %
3849
1.4 %
81136
29.4 %
Poor 10663
3.9 %
3815
1.4 %
14478
5.3 %
Very good 91728
33.3 %
2252
0.8 %
93980
34.1 %
Total 260883
94.7 %
14720
5.3 %
275603
100 %
χ2=18764.085 · df=4 · Cramer’s V=0.261 · p=0.000
p6 + facet_wrap(~race)

Wellbeing and CVD

p8 <- df_final %>%
  ggplot(aes(x = CVD, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on patient with and without CVD")
p8 

table(df_final$wellbeing, df_final$CVD)
##                 
##                  Cardiovascular Disease No Cardiovascular Disease
##   Good_Wellbeing                  16318                    244565
##   Poor_Wellbeing                   1622                     13098
rx.p1 <- df_final %>% 
  ggplot(aes(x = CVD, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = CVD, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

There seems to be visual difference in the percentages of the Poor and Good Wellbeing for poeple with and without CVD

Wellbeing and Physical Activity

table(df_final$wellbeing, df_final$HealthInsurance)
##                 
##                  No Health Coverage With Health Coverage
##   Good_Wellbeing              24146               236737
##   Poor_Wellbeing               3208                11512
x1 = df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")


x2 = df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Physical Activity in the Last 30 Days") +
  ggtitle("Physical Activity or Exercise in the Last 30 Days") +
  scale_x_discrete(labels = wrap_format(10)) +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none",
        axis.text.x = element_text( size=14, face="bold"))

x1 + x2

Wellbeing and Smoking Status

smoke.p2 <- df_final %>% 
  ggplot(aes(x = CurrentSmoker, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Current Smoking Status") +
  ggtitle("Current Smoking Status") +
  scale_x_discrete(labels = wrap_format(10)) +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none",
        axis.text.x = element_text( size=14, face="bold"))

library(patchwork)
smoke.p2

Wellbeing and LimitedActivity

table(df_final$LimitedActivity, df_final$wellbeing)
##              
##               Good_Wellbeing Poor_Wellbeing
##   Limited              63039           9231
##   Not Limited         197844           5489
 df_final %>% 
  ggplot(aes(x = LimitedActivity , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = LimitedActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be confirmed visually that with Low Physical Activitys there is a significant increase in proportion of the Poor Wellbeing of individuals

limact_p <- df_final %>% 
  ggplot(aes(x = LimitedActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Limited activity due to health problems") +
  ggtitle("Limited Activity due to Health Problems") +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none")

limact_p

Wellbeing and Diabetes

table(df_final$wellbeing, df_final$Diabetes)
##                 
##                  Diabetic Non Diabetic
##   Good_Wellbeing    30024       230859
##   Poor_Wellbeing     2728        11992
rx.p1 <- df_final %>% 
  ggplot(aes(x = Diabetes, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = Diabetes, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2


There seems to be a visual difference in the percentages of the Poor and Good Wellbeing between Diabetic and Non Diabetic individuals.

Wellbeing and Employment

table(df_final$wellbeing, df_final$employ)
##                 
##                  Homemaker Retired Self-Employed Student Unable to work
##   Good_Wellbeing     17872   72112         22313    3822          13343
##   Poor_Wellbeing       577    2411           805     229           4164
##                 
##                  Unemployed for less than 1 yr Unemployed for more than 1 yr
##   Good_Wellbeing                          6786                          7285
##   Poor_Wellbeing                          1070                          1529
##                 
##                  Waged Employement
##   Good_Wellbeing            117350
##   Poor_Wellbeing              3935
 df_final %>% 
  ggplot(aes(x = employ, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = employ, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  xlab("employment status") +
  ylab("proportion") +
  ggtitle("Wellbeing status by employment status") +
  theme_bw() +
  scale_x_discrete(labels = wrap_format(10)) +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "top")

Wellbeing, age, employments status

p_empa <- df_final %>%
  ggplot(aes(x = employ, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  xlab("employment status") +
  ylab("proportion") +
  ggtitle("Wellbeing status by employment status, group by age") +
  facet_wrap(~Age_group, ncol=2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "top")
p_empa


–> There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Homemaker, Retired and Waged Employment
–> There is relatively higher proportions of Poor Wellbeing in students than the prior-mentioned employment groups.
–> Individuals who are unable to work or are unemployed (either more than a year or less than a year) have significantly higher proportions of Poor Wellbeing than any other employment status groups.

Wellbeing and Health care access (insurance)

rx.p1 <- df_final %>% 
  ggplot(aes(x = HealthInsurance, fill = wellbeing)) + 
  geom_bar() +  
  ylab("number of individuals") +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none") 

rx.p2 <- df_final %>% 
  ggplot(aes(x = HealthInsurance, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  theme_bw() +
  scale_fill_manual(values=my_colors2)

library(patchwork)
rx.p1 + rx.p2 + ggtitle("Wellbeing status by health insurance")

Wellbeing and BMI

table(df_final$wellbeing, df_final$BMI)
##                 
##                  Healthy weight Obese Overweight
##   Good_Wellbeing          89792 73957      97134
##   Poor_Wellbeing           4544  5651       4525
 df_final %>% 
  ggplot(aes(x = BMI, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = BMI, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")


Individuals with higher BMI tend to have slightly higher proportions of “POOR” wellbeing.

Wellbeing and Poor Sleep Days

table(df_final$PoorSleepDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0           93145           2527
##   1            8051            164
##   2           21461            487
##   3           14905            416
##   4            9838            322
##   5           20899            733
##   6            3987            180
##   7            5813            308
##   8            2415            108
##   9             251             17
##   10          17910           1006
##   11             61              9
##   12           1624            115
##   13             88              8
##   14           1728            160
##   15          15792           1396
##   16            233             34
##   17            137             24
##   18            269             37
##   19             34              5
##   20          11946           1339
##   21            408             64
##   22            186             34
##   23             79             21
##   24            180             26
##   25           4405            632
##   26            240             36
##   27            294             40
##   28           1418            201
##   29            641             83
##   30          22445           4188
 df_final %>% 
  ggplot(aes(x = PoorSleepDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorSleepDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Sleep days there is a significant increase in the Poor Wellbeing of indivduals.

Wellbeing and PhysicalActivity

table(df_final$PhysicalActivity, df_final$wellbeing)
##                        
##                         Good_Wellbeing Poor_Wellbeing
##   Not Physically Active          61658           6683
##   Physically Active             199225           8037
x1 = df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")


x2 =df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")
library(patchwork)
x1+x2

It could be confirmed visually that with Low Physical Activitys there is a significant increase in proportion of the Poor Wellbeing of individuals

Wellbeing and Smoking Status

table(df_final$wellbeing, df_final$CurrentSmoker)
##                 
##                  Current Smoker Not a Current Smoker
##   Good_Wellbeing          38848               222035
##   Poor_Wellbeing           5240                 9480
rx.p1 <- df_final %>% 
  ggplot(aes(x = CurrentSmoker, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = CurrentSmoker, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

From the Visual Evidence, it could be said that the Smoking Status affect the Wellbeing of an individual. It could interpreted from the graph that more number of current smokers possess poor wellbeing as compared to those who are not a current smoker.

Wellbeing and Heavy Drinking Habits

table(df_final$wellbeing, df_final$HeavyDrinker)
##                 
##                  Heavy Drinker Not a Heavy Drinker
##   Good_Wellbeing         13172              247711
##   Poor_Wellbeing           959               13761
rx.p1 <- df_final %>% 
  ggplot(aes(x = HeavyDrinker, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = HeavyDrinker, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Heavy Drinker and not a Heavy Drinker.

Wellbeing and Poor Health Days

Poor Physical Health Days

table(df_final$PoorPhysicalHealthDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0          169695           4619
##   1           12060            417
##   2           15411            733
##   3            8575            544
##   4            4536            291
##   5            7890            551
##   6            1244            113
##   7            4433            351
##   8             775             61
##   9             142             22
##   10           5668            561
##   11             48              6
##   12            508             79
##   13             58              6
##   14           2460            216
##   15           4750            708
##   16            109             35
##   17             68             21
##   18            132             33
##   19             20              8
##   20           2962            591
##   21            573             70
##   22             67             13
##   23             44              9
##   24             51             17
##   25           1193            294
##   26             64              7
##   27             92             25
##   28            435            128
##   29            172             71
##   30          16648           4120
 df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Physical Health days there is a significant increase in the Poor Wellbeing of indivduals

Poor Mental Health Days

table(df_final$PoorMentalHealthDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0          183277           2907
##   1            9195            229
##   2           14268            550
##   3            7948            397
##   4            3895            277
##   5            9611            706
##   6             914            110
##   7            3268            342
##   8             700             78
##   9              88             19
##   10           6480            922
##   11             21              6
##   12            368             77
##   13             36             10
##   14           1191            197
##   15           5143           1151
##   16             60             16
##   17             44             17
##   18             62             28
##   19              9              3
##   20           2953            995
##   21            217             70
##   22             35             21
##   23             22              4
##   24             39             10
##   25            976            420
##   26             33             10
##   27             66             34
##   28            285            121
##   29            188             70
##   30           9491           4923
 df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Mental Health days there is a significant increase in the Poor Wellbeing of indivduals.

Wellbeing, poor mental health, drinking

df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  facet_wrap(~HeavyDrinker) +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "bottom")

Comparing the increment of Poor Wellbeing in Poor Mental and Poor Physical Wellbeing

r1 = df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

r2 = df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
r1+r2

#### Comparing Poor Mental, Physical and Poor Sleep Days

poor_mh_p <- df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Number of poor mental health days") +
  ggtitle("Poor Mental Health Days") +
  scale_x_discrete(labels = wrap_format(10)) +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none")

poor_ph_p <- df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Number of poor physical health days") +
  ggtitle("Poor Physical Health Days") +
  scale_x_discrete(labels = wrap_format(10)) +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none")

poor_s_p <- df_final %>% 
  ggplot(aes(x = PoorSleepDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  xlab("Number of poor sleep days") +
  scale_x_discrete(labels = wrap_format(10)) +
  ggtitle("Poor Sleep Days") +
  theme_bw() +
  scale_fill_manual(values=my_colors2) +
  theme(legend.position = "none")  +
  theme(legend.position = "right")

poor_mh_p + poor_ph_p + poor_s_p

It is visually prominent that in both cases whenevr there is increase in the number of Poor Physical Health days or Poor Mental Health Days there seems to be almost linear increase in the Poor Wellbeing of the individuals

This also coduld be deduced from the graph that there is a clear association between Poor Health Days (Mental or Physical) and the Wellbeing of an indivdual.



The more interesting part to notice in here is that The increment in proportions of “Poor Wellbeing” in the Poor Mental Health days is quite more there is a significant increase in the Poor Wellbeing of indivduals.

Inspecting Presence of potential correlations

Poor Physical and Poor Mental Health days

df_final %>% ggplot(aes(x=PoorPhysicalHealthDays,y=PoorMentalHealthDays,colour =wellbeing)) + geom_point() + geom_abline(intercept=30, slope=-1)

Visual Representation rejects my hypothesis that there is any correlation between the Poor Physical and Poor Mental heath days.



1. No presence of correlation between the Poor Mental and Poor Health Days
2. If the Graph is colored based on the Well being of an individual some thing very important starts to pop up :
We can visually bifurcate the presence of “Poor Wellbeing and Good Wellbeing” with a line y=-x where the upper half denotes More Poor Physical and More Poor Mental Health days where individauls have Poor Wellbeing
3. The sparse distribution of points in the upper half of the line signifies relatively lesser records of individuals who have higher poor mental or physical health days.

Some College Rate and Some High School Rate in the presence of Wellbeing

df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(~wellbeing)

There is a clear presence of correlation between the two variables. That indicates that with increase in the College Rate there is parallel increment in School Rate specific to the county.

It is also evident that the distribution more or less remains the same for both wellbeings.

We could reject the possibility of interaction on wellbeing on High School and Some College Rates

PM2.5 and Population Density in the presence of Wellbeing

df_final %>% ggplot(aes(x=PM2.5,y=PopulationDensity,colour = wellbeing)) + geom_point() + facet_wrap(~wellbeing) +ylim(0,15000)
## Warning: Removed 919 rows containing missing values (`geom_point()`).

There is a clear presence of correlation between the two variables. That indicates with increase in the Population Density there is increment in the PM2.5 specific to the county.

It is also evident that the distribution more or less remains the same for both wellbeings.

We could reject the possibility of interaction on wellbeing on High School and Some College Rates

df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(HeavyDrinker~wellbeing)

Correlation between All Numeric Variables

# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)

corr_m <- cor(rr_cm, use="pairwise.complete.obs")

Correlation Table

# correlation matrix table
tab_corr(rr_county,
         na.deletion="pairwise",
         corr.method="pearson",
         title="Correlations with pair-wise NA deletion",
         show.p=TRUE)
Correlations with pair-wise NA deletion
  age PrematureMortality HouseholdIncome ParkAccess CrimeRate UnemploymentRate WaterSafety HighSchoolRate SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5 PopulationDensity
age   0.023*** -0.056*** -0.035*** -0.006** 0.021*** 0.005* -0.015*** -0.038*** -0.002 -0.052*** -0.016*** -0.014***
PrematureMortality 0.023***   -0.681*** -0.308*** 0.398*** 0.336*** 0.103*** -0.402*** -0.609*** -0.445*** 0.276*** 0.208*** -0.043***
HouseholdIncome -0.056*** -0.681***   0.361*** -0.211*** -0.361*** -0.093*** 0.366*** 0.660*** 0.480*** -0.034*** -0.093*** 0.141***
ParkAccess -0.035*** -0.308*** 0.361***   0.290*** -0.109*** -0.137*** -0.115*** 0.486*** 0.220*** 0.176*** -0.130*** 0.373***
CrimeRate -0.006** 0.398*** -0.211*** 0.290***   0.268*** -0.092*** -0.523*** -0.067*** -0.122*** 0.262*** 0.001 0.273***
UnemploymentRate 0.021*** 0.336*** -0.361*** -0.109*** 0.268***   -0.048*** -0.329*** -0.427*** -0.311*** 0.043*** -0.103*** 0.019***
WaterSafety 0.005* 0.103*** -0.093*** -0.137*** -0.092*** -0.048***   0.048*** -0.128*** -0.064*** 0.021*** 0.004* -0.071***
HighSchoolRate -0.015*** -0.402*** 0.366*** -0.115*** -0.523*** -0.329*** 0.048***   0.242*** 0.248*** -0.157*** -0.071*** -0.168***
SomeCollegeRate -0.038*** -0.609*** 0.660*** 0.486*** -0.067*** -0.427*** -0.128*** 0.242***   0.530*** -0.023*** 0.004* 0.200***
AccessToRecFacility -0.002 -0.445*** 0.480*** 0.220*** -0.122*** -0.311*** -0.064*** 0.248*** 0.530***   -0.262*** -0.009*** 0.182***
FastFoodPercentage -0.052*** 0.276*** -0.034*** 0.176*** 0.262*** 0.043*** 0.021*** -0.157*** -0.023*** -0.262***   0.145*** -0.011***
PM2.5 -0.016*** 0.208*** -0.093*** -0.130*** 0.001 -0.103*** 0.004* -0.071*** 0.004* -0.009*** 0.145***   -0.020***
PopulationDensity -0.014*** -0.043*** 0.141*** 0.373*** 0.273*** 0.019*** -0.071*** -0.168*** 0.200*** 0.182*** -0.011*** -0.020***  
Computed correlation used pearson-method with pairwise-deletion.

Correlation Visualization

# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
               upper = "ellipse",
               p.mat = testRes$p,
               title="Correlation Matrix of Variables of Interest"
               )

### Highly Correlated Groups

pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)

Association between Wellbeing and County Variables

# density plots
premort_wb_plot <- df_final %>% 
  ggdensity(  x = "PrematureMortality" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25)

hincome_wb_plot <- df_final %>% 
  ggdensity(  x = "HouseholdIncome" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) + 
  theme(legend.position="none")

park_wb_plot <- df_final %>% 
  ggdensity(  x = "ParkAccess" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
  theme(legend.position="none")

crime_wb_plot <- df_final %>% 
  ggdensity(  x = "CrimeRate" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
  theme(legend.position="none")

unemp_wb_plot <- df_final %>% 
  ggdensity(  x = "UnemploymentRate" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25)

water_wb_plot <- df_final %>% 
  ggdensity(  x = "WaterSafety" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
  theme(legend.position="none")

rec_wb_plot <- df_final %>% 
  ggdensity(  x = "AccessToRecFacility" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
  theme(legend.position="none")

fastfood_wb_plot <- df_final %>% 
  ggdensity(  x = "FastFoodPercentage" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
  theme(legend.position="none")

Premature Mortality Plots

# premature mortality
premort_wb_plot + ggtitle("Density plot of county premature mortality by wellbeing") + theme(legend.position="right") + theme_bw()

df_final %>%
  ggplot(aes(x=wellbeing, y=PrematureMortality, fill=wellbeing)) +
  geom_violin(alpha=0.75) +
  theme_bw() +
  ggtitle("Violin Plot of Premature Mortality by Wellbeing Status") +
  scale_fill_manual(values=my_colors2)

# county level factors won't changed based on wellbeing, which is individual
df_final %>%
  ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
  geom_point(shape=1,alpha=0.5) +
  theme_bw() +
  stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
  ggtitle("Scatter Plot of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
  geom_point(alpha=0) +
  theme_bw() +
  stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
  ggtitle("Trendlines of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") 

Counties with higher mean household incomes tended to have lower premature mortality rates. This trend appears consistent across individuals who reported having good wellbeing and those that reported having poor wellbeing.

df_final %>%
  ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
  geom_jitter(shape=1,alpha=0.5) +
  theme_bw() +
  ggtitle("Jitter Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Box Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

Household Income Plots

hincome_wb_plot + ggtitle("Density plot of county household income by wellbeing") + theme(legend.position="right") + theme_bw()

df_final %>%
  ggplot(aes(x=HouseholdIncome, group=wellbeing, fill=wellbeing)) +
  geom_density() +
  facet_wrap(~wellbeing, ncol=1) +
  theme_bw() +
  theme(legend.position = "bottom") +
  scale_fill_manual(values=my_colors2)

df_final %>%
  ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
  geom_jitter(shape=1,alpha=0.5) +
  theme_bw() +
  ggtitle("Jitter Plot of County-Level Mean Household Income and\nIndividual Income Group") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Box Plot of County-Level Mean Household Income and\nInividual Income Group") + 
  facet_grid(~wellbeing)

Among those with good wellbeing, there appear to be more indviduals living in counties with high household incomes.

Bin HouseholdIncome

# bin mean household income to match individual income bins
df_final <- df_final %>%
  mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))

df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df_final$HouseholdIncome.bin)
## x <categorical> 
## # total N=275603 valid N=275603 mean=3.40 sd=0.61
## 
## Value           |      N | Raw % | Valid % | Cum. %
## ---------------------------------------------------
## less than 25000 |    214 |  0.08 |    0.08 |   0.08
## [25000,35000)   |  17292 |  6.27 |    6.27 |   6.35
## [35000,50000)   | 131487 | 47.71 |   47.71 |  54.06
## 50000 or more   | 126610 | 45.94 |   45.94 | 100.00
## <NA>            |      0 |  0.00 |    <NA> |   <NA>
# create equal sized bins
library(funModeling)
df_final$HouseholdIncome.bin.equal <- equal_freq(var=df_final$HouseholdIncome, n_bins=5)
frq(df_final$HouseholdIncome.bin.equal)
## x <categorical> 
## # total N=275603 valid N=275603 mean=2.99 sd=1.41
## 
## Value          |     N | Raw % | Valid % | Cum. %
## -------------------------------------------------
## [23751, 40704) | 55467 | 20.13 |   20.13 |  20.13
## [40704, 46233) | 54919 | 19.93 |   19.93 |  40.05
## [46233, 51106) | 55490 | 20.13 |   20.13 |  60.19
## [51106, 60397) | 55150 | 20.01 |   20.01 |  80.20
## [60397,119525] | 54577 | 19.80 |   19.80 | 100.00
## <NA>           |     0 |  0.00 |    <NA> |   <NA>
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)  
HouseholdIncome.bin wellbeing Total
Good_Wellbeing Poor_Wellbeing
less than 25000 203
203
0.1 %
11
11
0.1 %
214
214
0.1 %
[25000,35000) 16170
16368
6.2 %
1122
924
7.6 %
17292
17292
6.3 %
[35000,50000) 123944
124464
47.5 %
7543
7023
51.2 %
131487
131487
47.7 %
50000 or more 120566
119848
46.2 %
6044
6762
41.1 %
126610
126610
45.9 %
Total 260883
260883
100 %
14720
14720
100 %
275603
275603
100 %
χ2=166.368 · df=3 · Cramer’s V=0.025 · p=0.000
# compare againt individual income
df_final %>% 
  group_by(HouseholdIncome.bin) %>%
  ggplot(aes(x = income, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  facet_wrap(~HouseholdIncome.bin) +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))

Park Access Plots

park_wb_plot + ggtitle("Density plot of county park access by wellbeing") + theme(legend.position="right") + theme_bw()

Crime Rate Plots

crime_wb_plot + ggtitle("Density plot of county crime rate by wellbeing") + theme(legend.position="right") + theme_bw()

Unemployment Rate Plots

unemp_wb_plot + ggtitle("Density plot of county unemployment rate by wellbeing") + theme(legend.position="right") + theme_bw()

Water Safety Plots

water_wb_plot + ggtitle("Density plot of Water Safety Rate by wellbeing") + theme(legend.position="right") + theme_bw()

waterlog_wb_plot <- df_final %>% 
  mutate(WaterSafetyLog = log(WaterSafety,10)) %>%
  filter(WaterSafetyLog >= 0) %>%
  ggdensity(  x = "WaterSafetyLog" ,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="right")

waterlog_wb_plot

# box plots
df_final %>%
  ggplot(aes(x=WaterSafety, group=wellbeing, fill=wellbeing)) +
  geom_boxplot() +
  facet_wrap(~wellbeing, ncol=1) +
  theme_bw() +
  scale_fill_manual(values=my_colors2)

Access to Recreational Facilities Plots

rec_wb_plot + ggtitle("Density plot of county recreational facility access by wellbeing") + theme(legend.position="right") + theme_bw()

Fast Food Percentage Plots

fastfood_wb_plot + ggtitle("Density plot of county fast food restaurant percentage\nby wellbeing") + theme(legend.position="right") + theme_bw()

By BMI
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=BMI, fill=BMI)) +
  geom_density() +
  facet_wrap(~BMI) +
  theme_bw()

By CVD
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=CVD, fill=CVD)) +
  geom_density() +
  facet_wrap(~CVD) +
  theme_bw()

By General Health
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=GeneralHealth, fill=GeneralHealth)) +
  geom_density() +
  facet_wrap(~GeneralHealth) +
  theme_bw()

By Diabetes
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=Diabetes, fill=Diabetes)) +
  geom_density() +
  facet_wrap(~Diabetes) +
  theme_bw()

All Plots

multiplot(premort_wb_plot, hincome_wb_plot, park_wb_plot, crime_wb_plot, unemp_wb_plot, water_wb_plot, rec_wb_plot, fastfood_wb_plot, plotlist=NULL, cols=2)

Logistic Regression Model (Statistical Methods)

All Predictors

# set up variables 
dependent <- "wellbeing"
pred.all <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","PhysicalActivity","marital","PrematureMortality","HouseholdIncome","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","HighSchoolRate","SomeCollegeRate","AccessToRecFacility","FastFoodPercentage","PM2.5","PopulationDensity")
# initial model
model.all <- glm(wellbeing ~ 
                 age+
                 SocialSupport+
                 race+
                 income+
                 GeneralHealth+
                 PoorPhysicalHealthDays+
                 PoorMentalHealthDays+
                 Asthma+
                 HealthInsurance+
                 CVD+
                 LimitedActivity+
                 Diabetes+
                 employ+
                   BMI+
                  HeavyDrinker+
                  CurrentSmoker+
                  PoorSleepDays+
                  PhysicalActivity+
                  marital+
                  PrematureMortality+
                  HouseholdIncome+
                  ParkAccess+
                  CrimeRate+
                  UnemploymentRate+
                  WaterSafety+
                  HighSchoolRate+
                  SomeCollegeRate+
                  AccessToRecFacility+
                  FastFoodPercentage+
                  PM2.5+
                  PopulationDensity,
               family=binomial,
               data=df_final)

summary(model.all)
## 
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     Asthma + HealthInsurance + CVD + LimitedActivity + Diabetes + 
##     employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + 
##     PhysicalActivity + marital + PrematureMortality + HouseholdIncome + 
##     ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + 
##     HighSchoolRate + SomeCollegeRate + AccessToRecFacility + 
##     FastFoodPercentage + PM2.5 + PopulationDensity, family = binomial, 
##     data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5156  -0.2318  -0.1417  -0.1015   3.6273  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -2.137e+00  2.325e-01  -9.194  < 2e-16 ***
## age                                 -4.217e-03  9.191e-04  -4.588 4.48e-06 ***
## SocialSupport Sometimes             -8.367e-01  3.217e-02 -26.010  < 2e-16 ***
## SocialSupportAlways                 -2.672e+00  3.621e-02 -73.806  < 2e-16 ***
## SocialSupportNever                  -7.592e-01  4.080e-02 -18.607  < 2e-16 ***
## SocialSupportUsually                -2.023e+00  3.455e-02 -58.559  < 2e-16 ***
## raceWhite                            3.346e-01  2.685e-02  12.460  < 2e-16 ***
## income[25000-35000)                 -5.318e-02  3.509e-02  -1.516 0.129613    
## income[35000-50000)                 -1.069e-01  3.579e-02  -2.986 0.002825 ** 
## income50000 or more                 -3.181e-01  3.415e-02  -9.314  < 2e-16 ***
## incomeless than 15000                1.992e-02  2.923e-02   0.682 0.495539    
## GeneralHealthFair                    6.526e-01  4.807e-02  13.574  < 2e-16 ***
## GeneralHealthGood                    3.519e-01  4.399e-02   8.001 1.23e-15 ***
## GeneralHealthPoor                    1.095e+00  5.525e-02  19.829  < 2e-16 ***
## GeneralHealthVery good               1.928e-01  4.481e-02   4.302 1.70e-05 ***
## PoorPhysicalHealthDays               2.667e-03  1.195e-03   2.232 0.025602 *  
## PoorMentalHealthDays                 6.146e-02  9.248e-04  66.456  < 2e-16 ***
## AsthmaNo Asthma                      4.050e-02  2.912e-02   1.391 0.164245    
## HealthInsuranceWith Health Coverage -2.387e-01  2.894e-02  -8.248  < 2e-16 ***
## CVDNo Cardiovascular Disease         4.161e-02  3.596e-02   1.157 0.247193    
## LimitedActivityNot Limited          -4.960e-01  2.502e-02 -19.826  < 2e-16 ***
## DiabetesNon Diabetic                 2.108e-03  2.899e-02   0.073 0.942033    
## employRetired                       -1.379e-02  5.539e-02  -0.249 0.803426    
## employSelf-Employed                  2.665e-01  6.257e-02   4.259 2.06e-05 ***
## employStudent                        2.313e-01  9.316e-02   2.483 0.013040 *  
## employUnable to work                 4.559e-01  5.471e-02   8.332  < 2e-16 ***
## employUnemployed for less than 1 yr  9.543e-01  6.289e-02  15.173  < 2e-16 ***
## employUnemployed for more than 1 yr  9.454e-01  5.964e-02  15.853  < 2e-16 ***
## employWaged Employement              1.724e-01  5.174e-02   3.333 0.000861 ***
## BMIObese                            -1.470e-02  2.624e-02  -0.560 0.575193    
## BMIOverweight                       -7.499e-02  2.572e-02  -2.915 0.003552 ** 
## HeavyDrinkerNot a Heavy Drinker     -2.807e-01  4.224e-02  -6.646 3.01e-11 ***
## CurrentSmokerNot a Current Smoker   -2.295e-01  2.369e-02  -9.688  < 2e-16 ***
## PoorSleepDays                        1.686e-02  9.536e-04  17.682  < 2e-16 ***
## PhysicalActivityPhysically Active   -1.364e-01  2.205e-02  -6.184 6.23e-10 ***
## maritalNot married                   4.786e-01  2.352e-02  20.346  < 2e-16 ***
## PrematureMortality                  -5.297e-04  2.043e-04  -2.592 0.009539 ** 
## HouseholdIncome                      1.840e-07  1.324e-06   0.139 0.889466    
## ParkAccess                           2.148e-03  5.709e-04   3.763 0.000168 ***
## CrimeRate                            9.953e-05  5.096e-05   1.953 0.050818 .  
## UnemploymentRate                     7.559e-03  4.678e-03   1.616 0.106126    
## WaterSafety                         -2.522e-03  7.138e-04  -3.533 0.000410 ***
## HighSchoolRate                      -2.340e-04  1.419e-03  -0.165 0.868981    
## SomeCollegeRate                      7.855e-03  1.536e-03   5.115 3.13e-07 ***
## AccessToRecFacility                 -3.286e-03  2.662e-03  -1.234 0.217029    
## FastFoodPercentage                  -1.703e-03  1.405e-03  -1.212 0.225588    
## PM2.5                                4.086e-03  7.346e-03   0.556 0.578058    
## PopulationDensity                    4.443e-06  2.577e-06   1.724 0.084624 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73633  on 275555  degrees of freedom
## AIC: 73729
## 
## Number of Fisher Scoring iterations: 7

Multicollinearity

Correlations for all numeric predictors

# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)

corr_m <- cor(rr_cm, use="pairwise.complete.obs")
Correlation Table
# correlation matrix table
tab_corr(rr_county,
         na.deletion="pairwise",
         corr.method="pearson",
         title="Correlations with pair-wise NA deletion",
         show.p=TRUE)
Correlations with pair-wise NA deletion
  age PrematureMortality HouseholdIncome ParkAccess CrimeRate UnemploymentRate WaterSafety HighSchoolRate SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5 PopulationDensity
age   0.023*** -0.056*** -0.035*** -0.006** 0.021*** 0.005* -0.015*** -0.038*** -0.002 -0.052*** -0.016*** -0.014***
PrematureMortality 0.023***   -0.681*** -0.308*** 0.398*** 0.336*** 0.103*** -0.402*** -0.609*** -0.445*** 0.276*** 0.208*** -0.043***
HouseholdIncome -0.056*** -0.681***   0.361*** -0.211*** -0.361*** -0.093*** 0.366*** 0.660*** 0.480*** -0.034*** -0.093*** 0.141***
ParkAccess -0.035*** -0.308*** 0.361***   0.290*** -0.109*** -0.137*** -0.115*** 0.486*** 0.220*** 0.176*** -0.130*** 0.373***
CrimeRate -0.006** 0.398*** -0.211*** 0.290***   0.268*** -0.092*** -0.523*** -0.067*** -0.122*** 0.262*** 0.001 0.273***
UnemploymentRate 0.021*** 0.336*** -0.361*** -0.109*** 0.268***   -0.048*** -0.329*** -0.427*** -0.311*** 0.043*** -0.103*** 0.019***
WaterSafety 0.005* 0.103*** -0.093*** -0.137*** -0.092*** -0.048***   0.048*** -0.128*** -0.064*** 0.021*** 0.004* -0.071***
HighSchoolRate -0.015*** -0.402*** 0.366*** -0.115*** -0.523*** -0.329*** 0.048***   0.242*** 0.248*** -0.157*** -0.071*** -0.168***
SomeCollegeRate -0.038*** -0.609*** 0.660*** 0.486*** -0.067*** -0.427*** -0.128*** 0.242***   0.530*** -0.023*** 0.004* 0.200***
AccessToRecFacility -0.002 -0.445*** 0.480*** 0.220*** -0.122*** -0.311*** -0.064*** 0.248*** 0.530***   -0.262*** -0.009*** 0.182***
FastFoodPercentage -0.052*** 0.276*** -0.034*** 0.176*** 0.262*** 0.043*** 0.021*** -0.157*** -0.023*** -0.262***   0.145*** -0.011***
PM2.5 -0.016*** 0.208*** -0.093*** -0.130*** 0.001 -0.103*** 0.004* -0.071*** 0.004* -0.009*** 0.145***   -0.020***
PopulationDensity -0.014*** -0.043*** 0.141*** 0.373*** 0.273*** 0.019*** -0.071*** -0.168*** 0.200*** 0.182*** -0.011*** -0.020***  
Computed correlation used pearson-method with pairwise-deletion.
Correlation Visualization
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
               upper = "ellipse",
               p.mat = testRes$p,
               title="Correlation Matrix of Variables of Interest",
               tl.pos = "lt")

##### Highly Correlated Groups

pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)

rr_county_sig <- df_final[,c(2,22,24,25,26,27,29)]
rr_cm_sig <- as.matrix(rr_county_sig)

corr_m_sig <- cor(rr_cm_sig, use="pairwise.complete.obs")
# correlogram
testRes_sig <- cor.mtest(rr_cm_sig, conf.level = 0.95)
corrplot.mixed(corr_m_sig,
               upper = "ellipse",
               p.mat = testRes_sig$p,
               tl.pos = "lt")

VIF for all Predictors Model

library(car)
library(finalfit)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
vif_all <- vif(model.all)
vif_all <- as.data.frame(vif_all)
#create horizontal bar chart to display each VIF value
vif_plot_all <- vif_all %>%
  ggplot(aes(x=rownames(vif_all), y=GVIF)) +
  geom_col(color="#434343", fill="#6BD6DB") +
  xlab("All Predictors") +
  ylab("VIF Values") +
  ggtitle("VIF Values for All Predictors") +
  theme_bw() +
  geom_hline(aes(yintercept=5),color="#434343", linetype="dashed", linewidth=0.5) +
  coord_flip()

vif_plot_all

Stepwise Regression

Stepwise function

step <- stepAIC(model.all, direction="both") # Stepwise function
## Start:  AIC=73729.1
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + Diabetes + employ + 
##     BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + HouseholdIncome + ParkAccess + 
##     CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate + 
##     SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - Diabetes                1    73633 73727
## - HouseholdIncome         1    73633 73727
## - HighSchoolRate          1    73633 73727
## - PM2.5                   1    73633 73727
## - CVD                     1    73634 73728
## - FastFoodPercentage      1    73635 73729
## - AccessToRecFacility     1    73635 73729
## - Asthma                  1    73635 73729
## <none>                         73633 73729
## - UnemploymentRate        1    73636 73730
## - PopulationDensity       1    73636 73730
## - CrimeRate               1    73637 73731
## - PoorPhysicalHealthDays  1    73638 73732
## - PrematureMortality      1    73640 73734
## - BMI                     2    73643 73735
## - WaterSafety             1    73646 73740
## - ParkAccess              1    73647 73741
## - age                     1    73654 73748
## - SomeCollegeRate         1    73659 73753
## - PhysicalActivity        1    73671 73765
## - HeavyDrinker            1    73675 73769
## - HealthInsurance         1    73700 73794
## - CurrentSmoker           1    73726 73820
## - income                  4    73737 73825
## - race                    1    73792 73886
## - PoorSleepDays           1    73940 74034
## - LimitedActivity         1    74022 74116
## - marital                 1    74053 74147
## - GeneralHealth           4    74150 74238
## - employ                  7    74352 74434
## - PoorMentalHealthDays    1    77897 77991
## - SocialSupport           4    81619 81707
## 
## Step:  AIC=73727.11
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + HouseholdIncome + ParkAccess + 
##     CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate + 
##     SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - HouseholdIncome         1    73633 73725
## - HighSchoolRate          1    73633 73725
## - PM2.5                   1    73633 73725
## - CVD                     1    73634 73726
## - FastFoodPercentage      1    73635 73727
## - AccessToRecFacility     1    73635 73727
## - Asthma                  1    73635 73727
## <none>                         73633 73727
## - UnemploymentRate        1    73636 73728
## - PopulationDensity       1    73636 73728
## - CrimeRate               1    73637 73729
## + Diabetes                1    73633 73729
## - PoorPhysicalHealthDays  1    73638 73730
## - PrematureMortality      1    73640 73732
## - BMI                     2    73643 73733
## - WaterSafety             1    73646 73738
## - ParkAccess              1    73647 73739
## - age                     1    73654 73746
## - SomeCollegeRate         1    73659 73751
## - PhysicalActivity        1    73671 73763
## - HeavyDrinker            1    73676 73768
## - HealthInsurance         1    73700 73792
## - CurrentSmoker           1    73726 73818
## - income                  4    73737 73823
## - race                    1    73793 73885
## - PoorSleepDays           1    73940 74032
## - LimitedActivity         1    74022 74114
## - marital                 1    74053 74145
## - GeneralHealth           4    74157 74243
## - employ                  7    74352 74432
## - PoorMentalHealthDays    1    77898 77990
## - SocialSupport           4    81619 81705
## 
## Step:  AIC=73725.13
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + HighSchoolRate + SomeCollegeRate + AccessToRecFacility + 
##     FastFoodPercentage + PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - HighSchoolRate          1    73633 73723
## - PM2.5                   1    73633 73723
## - CVD                     1    73634 73724
## - FastFoodPercentage      1    73635 73725
## - AccessToRecFacility     1    73635 73725
## - Asthma                  1    73635 73725
## <none>                         73633 73725
## - UnemploymentRate        1    73636 73726
## - PopulationDensity       1    73636 73726
## - CrimeRate               1    73637 73727
## + HouseholdIncome         1    73633 73727
## + Diabetes                1    73633 73727
## - PoorPhysicalHealthDays  1    73638 73728
## - BMI                     2    73643 73731
## - PrematureMortality      1    73641 73731
## - WaterSafety             1    73646 73736
## - ParkAccess              1    73647 73737
## - age                     1    73654 73744
## - SomeCollegeRate         1    73662 73752
## - PhysicalActivity        1    73671 73761
## - HeavyDrinker            1    73676 73766
## - HealthInsurance         1    73700 73790
## - CurrentSmoker           1    73726 73816
## - income                  4    73738 73822
## - race                    1    73793 73883
## - PoorSleepDays           1    73940 74030
## - LimitedActivity         1    74022 74112
## - marital                 1    74053 74143
## - GeneralHealth           4    74157 74241
## - employ                  7    74352 74430
## - PoorMentalHealthDays    1    77899 77989
## - SocialSupport           4    81620 81704
## 
## Step:  AIC=73723.15
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - PM2.5                   1    73633 73721
## - CVD                     1    73635 73723
## - FastFoodPercentage      1    73635 73723
## - AccessToRecFacility     1    73635 73723
## - Asthma                  1    73635 73723
## <none>                         73633 73723
## - UnemploymentRate        1    73636 73724
## - PopulationDensity       1    73636 73724
## + HighSchoolRate          1    73633 73725
## + HouseholdIncome         1    73633 73725
## + Diabetes                1    73633 73725
## - CrimeRate               1    73638 73726
## - PoorPhysicalHealthDays  1    73638 73726
## - BMI                     2    73643 73729
## - PrematureMortality      1    73641 73729
## - WaterSafety             1    73646 73734
## - ParkAccess              1    73648 73736
## - age                     1    73654 73742
## - SomeCollegeRate         1    73662 73750
## - PhysicalActivity        1    73671 73759
## - HeavyDrinker            1    73676 73764
## - HealthInsurance         1    73700 73788
## - CurrentSmoker           1    73726 73814
## - income                  4    73738 73820
## - race                    1    73793 73881
## - PoorSleepDays           1    73941 74029
## - LimitedActivity         1    74022 74110
## - marital                 1    74053 74141
## - GeneralHealth           4    74157 74239
## - employ                  7    74352 74428
## - PoorMentalHealthDays    1    77899 77987
## - SocialSupport           4    81620 81702
## 
## Step:  AIC=73721.47
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PopulationDensity
## 
##                          Df Deviance   AIC
## - FastFoodPercentage      1    73635 73721
## - CVD                     1    73635 73721
## - AccessToRecFacility     1    73635 73721
## - Asthma                  1    73635 73721
## <none>                         73633 73721
## - UnemploymentRate        1    73636 73722
## - PopulationDensity       1    73636 73722
## + PM2.5                   1    73633 73723
## + HighSchoolRate          1    73633 73723
## + HouseholdIncome         1    73633 73723
## + Diabetes                1    73633 73723
## - CrimeRate               1    73638 73724
## - PoorPhysicalHealthDays  1    73638 73724
## - BMI                     2    73643 73727
## - PrematureMortality      1    73641 73727
## - WaterSafety             1    73646 73732
## - ParkAccess              1    73648 73734
## - age                     1    73655 73741
## - SomeCollegeRate         1    73663 73749
## - PhysicalActivity        1    73672 73758
## - HeavyDrinker            1    73676 73762
## - HealthInsurance         1    73701 73787
## - CurrentSmoker           1    73726 73812
## - income                  4    73738 73818
## - race                    1    73795 73881
## - PoorSleepDays           1    73941 74027
## - LimitedActivity         1    74022 74108
## - marital                 1    74054 74140
## - GeneralHealth           4    74158 74238
## - employ                  7    74353 74427
## - PoorMentalHealthDays    1    77899 77985
## - SocialSupport           4    81620 81700
## 
## Step:  AIC=73720.8
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + PopulationDensity
## 
##                          Df Deviance   AIC
## - AccessToRecFacility     1    73636 73720
## - CVD                     1    73636 73720
## - Asthma                  1    73637 73721
## <none>                         73635 73721
## + FastFoodPercentage      1    73633 73721
## - UnemploymentRate        1    73638 73722
## - PopulationDensity       1    73638 73722
## + PM2.5                   1    73635 73723
## + HighSchoolRate          1    73635 73723
## + HouseholdIncome         1    73635 73723
## + Diabetes                1    73635 73723
## - CrimeRate               1    73639 73723
## - PoorPhysicalHealthDays  1    73640 73724
## - BMI                     2    73645 73727
## - PrematureMortality      1    73645 73729
## - ParkAccess              1    73648 73732
## - WaterSafety             1    73648 73732
## - age                     1    73656 73740
## - SomeCollegeRate         1    73664 73748
## - PhysicalActivity        1    73673 73757
## - HeavyDrinker            1    73677 73761
## - HealthInsurance         1    73702 73786
## - CurrentSmoker           1    73728 73812
## - income                  4    73740 73818
## - race                    1    73798 73882
## - PoorSleepDays           1    73942 74026
## - LimitedActivity         1    74024 74108
## - marital                 1    74056 74140
## - GeneralHealth           4    74158 74236
## - employ                  7    74355 74427
## - PoorMentalHealthDays    1    77900 77984
## - SocialSupport           4    81622 81700
## 
## Step:  AIC=73719.81
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity + 
##     marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + PopulationDensity
## 
##                          Df Deviance   AIC
## - CVD                     1    73637 73719
## - Asthma                  1    73638 73720
## <none>                         73636 73720
## + AccessToRecFacility     1    73635 73721
## - PopulationDensity       1    73639 73721
## - UnemploymentRate        1    73639 73721
## + FastFoodPercentage      1    73635 73721
## + PM2.5                   1    73636 73722
## + HighSchoolRate          1    73636 73722
## + HouseholdIncome         1    73636 73722
## + Diabetes                1    73636 73722
## - CrimeRate               1    73640 73722
## - PoorPhysicalHealthDays  1    73641 73723
## - BMI                     2    73646 73726
## - PrematureMortality      1    73645 73727
## - WaterSafety             1    73649 73731
## - ParkAccess              1    73649 73731
## - age                     1    73657 73739
## - SomeCollegeRate         1    73664 73746
## - PhysicalActivity        1    73674 73756
## - HeavyDrinker            1    73678 73760
## - HealthInsurance         1    73703 73785
## - CurrentSmoker           1    73729 73811
## - income                  4    73741 73817
## - race                    1    73798 73880
## - PoorSleepDays           1    73943 74025
## - LimitedActivity         1    74025 74107
## - marital                 1    74056 74138
## - GeneralHealth           4    74160 74236
## - employ                  7    74355 74425
## - PoorMentalHealthDays    1    77900 77982
## - SocialSupport           4    81623 81699
## 
## Step:  AIC=73719.15
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + PhysicalActivity + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + PopulationDensity
## 
##                          Df Deviance   AIC
## <none>                         73637 73719
## - Asthma                  1    73639 73719
## + CVD                     1    73636 73720
## - PopulationDensity       1    73640 73720
## + AccessToRecFacility     1    73636 73720
## - UnemploymentRate        1    73640 73720
## + FastFoodPercentage      1    73636 73720
## + PM2.5                   1    73637 73721
## + HighSchoolRate          1    73637 73721
## + HouseholdIncome         1    73637 73721
## + Diabetes                1    73637 73721
## - CrimeRate               1    73641 73721
## - PoorPhysicalHealthDays  1    73642 73722
## - BMI                     2    73647 73725
## - PrematureMortality      1    73647 73727
## - WaterSafety             1    73650 73730
## - ParkAccess              1    73651 73731
## - age                     1    73660 73740
## - SomeCollegeRate         1    73666 73746
## - PhysicalActivity        1    73675 73755
## - HeavyDrinker            1    73680 73760
## - HealthInsurance         1    73705 73785
## - CurrentSmoker           1    73730 73810
## - income                  4    73743 73817
## - race                    1    73799 73879
## - PoorSleepDays           1    73943 74023
## - LimitedActivity         1    74026 74106
## - marital                 1    74058 74138
## - GeneralHealth           4    74163 74237
## - employ                  7    74357 74425
## - PoorMentalHealthDays    1    77904 77984
## - SocialSupport           4    81625 81699

Final Stepwise model

model.step <- glm(formula= df_final$wellbeing ~ . ,
             df_final[,(colnames(step$model[-1]))], family=binomial(logit))

summary(model.step)
## 
## Call:
## glm(formula = df_final$wellbeing ~ ., family = binomial(logit), 
##     data = df_final[, (colnames(step$model[-1]))])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5218  -0.2318  -0.1418  -0.1015   3.6337  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -2.136e+00  1.700e-01 -12.567  < 2e-16 ***
## age                                 -4.343e-03  9.086e-04  -4.779 1.76e-06 ***
## SocialSupport Sometimes             -8.361e-01  3.216e-02 -25.998  < 2e-16 ***
## SocialSupportAlways                 -2.672e+00  3.620e-02 -73.814  < 2e-16 ***
## SocialSupportNever                  -7.592e-01  4.079e-02 -18.610  < 2e-16 ***
## SocialSupportUsually                -2.023e+00  3.454e-02 -58.556  < 2e-16 ***
## raceWhite                            3.358e-01  2.672e-02  12.566  < 2e-16 ***
## income[25000-35000)                 -5.297e-02  3.508e-02  -1.510 0.130996    
## income[35000-50000)                 -1.068e-01  3.577e-02  -2.987 0.002815 ** 
## income50000 or more                 -3.186e-01  3.396e-02  -9.380  < 2e-16 ***
## incomeless than 15000                1.940e-02  2.921e-02   0.664 0.506674    
## GeneralHealthFair                    6.493e-01  4.789e-02  13.559  < 2e-16 ***
## GeneralHealthGood                    3.510e-01  4.396e-02   7.985 1.40e-15 ***
## GeneralHealthPoor                    1.088e+00  5.471e-02  19.892  < 2e-16 ***
## GeneralHealthVery good               1.929e-01  4.481e-02   4.305 1.67e-05 ***
## PoorPhysicalHealthDays               2.666e-03  1.194e-03   2.232 0.025636 *  
## PoorMentalHealthDays                 6.145e-02  9.244e-04  66.482  < 2e-16 ***
## AsthmaNo Asthma                      4.194e-02  2.910e-02   1.441 0.149508    
## HealthInsuranceWith Health Coverage -2.390e-01  2.891e-02  -8.267  < 2e-16 ***
## LimitedActivityNot Limited          -4.958e-01  2.500e-02 -19.835  < 2e-16 ***
## employRetired                       -1.492e-02  5.537e-02  -0.269 0.787574    
## employSelf-Employed                  2.677e-01  6.254e-02   4.280 1.87e-05 ***
## employStudent                        2.304e-01  9.313e-02   2.474 0.013351 *  
## employUnable to work                 4.554e-01  5.468e-02   8.328  < 2e-16 ***
## employUnemployed for less than 1 yr  9.542e-01  6.288e-02  15.176  < 2e-16 ***
## employUnemployed for more than 1 yr  9.460e-01  5.960e-02  15.873  < 2e-16 ***
## employWaged Employement              1.731e-01  5.172e-02   3.346 0.000820 ***
## BMIObese                            -1.599e-02  2.571e-02  -0.622 0.534076    
## BMIOverweight                       -7.575e-02  2.566e-02  -2.952 0.003156 ** 
## HeavyDrinkerNot a Heavy Drinker     -2.813e-01  4.221e-02  -6.663 2.68e-11 ***
## CurrentSmokerNot a Current Smoker   -2.299e-01  2.367e-02  -9.710  < 2e-16 ***
## PoorSleepDays                        1.682e-02  9.530e-04  17.650  < 2e-16 ***
## PhysicalActivityPhysically Active   -1.360e-01  2.203e-02  -6.173 6.70e-10 ***
## maritalNot married                   4.790e-01  2.350e-02  20.383  < 2e-16 ***
## PrematureMortality                  -5.347e-04  1.725e-04  -3.100 0.001934 ** 
## ParkAccess                           2.041e-03  5.496e-04   3.714 0.000204 ***
## CrimeRate                            9.690e-05  4.793e-05   2.022 0.043191 *  
## UnemploymentRate                     8.125e-03  4.568e-03   1.778 0.075334 .  
## WaterSafety                         -2.551e-03  7.127e-04  -3.580 0.000344 ***
## SomeCollegeRate                      7.366e-03  1.383e-03   5.326 1.00e-07 ***
## PopulationDensity                    4.423e-06  2.497e-06   1.771 0.076483 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73637  on 275562  degrees of freedom
## AIC: 73719
## 
## Number of Fisher Scoring iterations: 7

Remove statistically insignificant predictors

# setup variable 
pred.m1 <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","HealthInsurance","LimitedActivity","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","PhysicalActivity","marital","PrematureMortality","ParkAccess","CrimeRate","WaterSafety","SomeCollegeRate")
# model with reduced predictors
model.1 <- glm(wellbeing ~ 
                 age+
                 SocialSupport+
                 race+
                 income+
                 GeneralHealth+
                 PoorPhysicalHealthDays+
                 PoorMentalHealthDays+
                 HealthInsurance+
                 LimitedActivity+
                 employ+
                 BMI+
                  HeavyDrinker+
                  CurrentSmoker+
                  PoorSleepDays+
                 PhysicalActivity+
                  marital+
                  PrematureMortality+
                  ParkAccess+
                  CrimeRate+
                  WaterSafety+
                  SomeCollegeRate,
               family=binomial,
               data=df_final)

summary(model.1)
## 
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + PhysicalActivity + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + WaterSafety + 
##     SomeCollegeRate, family = binomial, data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5358  -0.2317  -0.1418  -0.1015   3.6273  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.9979684  0.1526996 -13.084  < 2e-16 ***
## age                                 -0.0042646  0.0009079  -4.697 2.64e-06 ***
## SocialSupport Sometimes             -0.8359781  0.0321565 -25.997  < 2e-16 ***
## SocialSupportAlways                 -2.6729660  0.0361949 -73.849  < 2e-16 ***
## SocialSupportNever                  -0.7586071  0.0407889 -18.598  < 2e-16 ***
## SocialSupportUsually                -2.0231561  0.0345371 -58.579  < 2e-16 ***
## raceWhite                            0.3310936  0.0266468  12.425  < 2e-16 ***
## income[25000-35000)                 -0.0528918  0.0350735  -1.508 0.131547    
## income[35000-50000)                 -0.1060674  0.0357617  -2.966 0.003018 ** 
## income50000 or more                 -0.3164002  0.0339561  -9.318  < 2e-16 ***
## incomeless than 15000                0.0185804  0.0292012   0.636 0.524588    
## GeneralHealthFair                    0.6465787  0.0478520  13.512  < 2e-16 ***
## GeneralHealthGood                    0.3494260  0.0439486   7.951 1.85e-15 ***
## GeneralHealthPoor                    1.0840043  0.0546523  19.835  < 2e-16 ***
## GeneralHealthVery good               0.1922407  0.0448107   4.290 1.79e-05 ***
## PoorPhysicalHealthDays               0.0026565  0.0011945   2.224 0.026155 *  
## PoorMentalHealthDays                 0.0614357  0.0009242  66.477  < 2e-16 ***
## HealthInsuranceWith Health Coverage -0.2399066  0.0288953  -8.303  < 2e-16 ***
## LimitedActivityNot Limited          -0.4931359  0.0249503 -19.765  < 2e-16 ***
## employRetired                       -0.0141648  0.0553658  -0.256 0.798075    
## employSelf-Employed                  0.2688574  0.0625238   4.300 1.71e-05 ***
## employStudent                        0.2311005  0.0931280   2.482 0.013082 *  
## employUnable to work                 0.4548113  0.0546684   8.319  < 2e-16 ***
## employUnemployed for less than 1 yr  0.9561157  0.0628808  15.205  < 2e-16 ***
## employUnemployed for more than 1 yr  0.9496466  0.0595914  15.936  < 2e-16 ***
## employWaged Employement              0.1723412  0.0517107   3.333 0.000860 ***
## BMIObese                            -0.0198489  0.0256516  -0.774 0.439056    
## BMIOverweight                       -0.0769001  0.0256529  -2.998 0.002720 ** 
## HeavyDrinkerNot a Heavy Drinker     -0.2831470  0.0422020  -6.709 1.96e-11 ***
## CurrentSmokerNot a Current Smoker   -0.2287189  0.0236696  -9.663  < 2e-16 ***
## PoorSleepDays                        0.0167672  0.0009523  17.608  < 2e-16 ***
## PhysicalActivityPhysically Active   -0.1353385  0.0220301  -6.143 8.08e-10 ***
## maritalNot married                   0.4798601  0.0234950  20.424  < 2e-16 ***
## PrematureMortality                  -0.0005309  0.0001725  -3.078 0.002086 ** 
## ParkAccess                           0.0022832  0.0005331   4.283 1.85e-05 ***
## CrimeRate                            0.0001229  0.0000468   2.626 0.008652 ** 
## WaterSafety                         -0.0026511  0.0007098  -3.735 0.000188 ***
## SomeCollegeRate                      0.0066705  0.0012933   5.158 2.50e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73645  on 275565  degrees of freedom
## AIC: 73721
## 
## Number of Fisher Scoring iterations: 7

Multicollinearity

Correlations for all numeric predictors

# library(corrplot)
# library(sjPlot)
rr_county.m1 <- df_final[,c(2,22, 24:27,29)]
rr_cm.m1 <- as.matrix(rr_county.m1)

corr_m1 <- cor(rr_cm.m1, use="pairwise.complete.obs")
Correlation Visualization
# correlogram
testRes.m1 <- cor.mtest(rr_cm.m1, conf.level = 0.95)
corrplot.mixed(corr_m1,
               upper = "ellipse",
               p.mat = testRes.m1$p,
               title="Correlation Matrix of Variables of Interest")

VIF for Reduced Predictors Model

vif_1 <- vif(model.1)
vif_1 <- as.data.frame(vif_1)
#create horizontal bar chart to display each VIF value
vif_plot_1 <- vif_1 %>%
  ggplot(aes(x=rownames(vif_1), y=GVIF)) +
  geom_col(color="#434343", fill="#FFB87B") +
  xlab("Statistically Significant Predictors") +
  ylab("VIF Values") +
  ggtitle("VIF Values for Reduced Model") +
  theme_bw() +
  geom_hline(aes(yintercept=5),color="#434343", linetype="dashed", linewidth=0.5) +
  coord_flip()

vif_plot_1

Odds Ratios and CI of Reduced Predictors

# forest plot
library(finalfit)
# df_final %>%
#   or_plot(dependent, pred.all)
# library(broom)
# model.1 %>%
#   tidy(conf.int=TRUE, exp=TRUE) %>%
#   datatable(rownames=FALSE,
#             colnames=c("predictor", "odds ratio", "std error", "statistic", "p-value", "CI low", "CI high"),
#             options=list(pageLength=15, scrollX=T))
# forest plot
# library(finalfit)
df_final %>%
or_plot(dependent, pred.m1)
## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Warning: Removed 12 rows containing missing values (`geom_errorbarh()`).

Interactions

Interaction of Social support in different races

Model_Int1 <- glm(wellbeing ~ 
                    age + 
                    race:SocialSupport +
                    income +
                    GeneralHealth + 
                    PoorPhysicalHealthDays + 
                    PoorMentalHealthDays + 
                    HealthInsurance + 
                    LimitedActivity + 
                    employ +
                    BMI + 
                    HeavyDrinker + 
                    CurrentSmoker + 
                    PoorSleepDays + 
                    PhysicalActivity + 
                    marital + 
                    PrematureMortality + 
                    ParkAccess + 
                    CrimeRate + 
                    WaterSafety + 
                    SomeCollegeRate, 
                  family = "binomial", df_final)
summary(Model_Int1)
## 
## Call:
## glm(formula = wellbeing ~ age + race:SocialSupport + income + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + PhysicalActivity + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + WaterSafety + 
##     SomeCollegeRate, family = "binomial", data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5552  -0.2319  -0.1415  -0.1011   3.5704  
## 
## Coefficients: (1 not defined because of singularities)
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -3.722e+00  1.510e-01 -24.650  < 2e-16
## age                                     -4.280e-03  9.083e-04  -4.712 2.45e-06
## income[25000-35000)                     -5.318e-02  3.509e-02  -1.516 0.129596
## income[35000-50000)                     -1.071e-01  3.578e-02  -2.993 0.002761
## income50000 or more                     -3.136e-01  3.397e-02  -9.231  < 2e-16
## incomeless than 15000                    2.217e-02  2.919e-02   0.759 0.447573
## GeneralHealthFair                        6.409e-01  4.785e-02  13.395  < 2e-16
## GeneralHealthGood                        3.427e-01  4.396e-02   7.798 6.31e-15
## GeneralHealthPoor                        1.078e+00  5.466e-02  19.729  < 2e-16
## GeneralHealthVery good                   1.890e-01  4.483e-02   4.217 2.48e-05
## PoorPhysicalHealthDays                   2.683e-03  1.195e-03   2.246 0.024729
## PoorMentalHealthDays                     6.128e-02  9.249e-04  66.254  < 2e-16
## HealthInsuranceWith Health Coverage     -2.398e-01  2.888e-02  -8.306  < 2e-16
## LimitedActivityNot Limited              -4.918e-01  2.496e-02 -19.699  < 2e-16
## employRetired                           -1.803e-02  5.543e-02  -0.325 0.744920
## employSelf-Employed                      2.658e-01  6.257e-02   4.247 2.16e-05
## employStudent                            2.260e-01  9.293e-02   2.432 0.015030
## employUnable to work                     4.519e-01  5.469e-02   8.263  < 2e-16
## employUnemployed for less than 1 yr      9.518e-01  6.287e-02  15.139  < 2e-16
## employUnemployed for more than 1 yr      9.464e-01  5.958e-02  15.883  < 2e-16
## employWaged Employement                  1.686e-01  5.173e-02   3.259 0.001118
## BMIObese                                -1.906e-02  2.566e-02  -0.743 0.457530
## BMIOverweight                           -7.643e-02  2.567e-02  -2.977 0.002911
## HeavyDrinkerNot a Heavy Drinker         -2.844e-01  4.225e-02  -6.732 1.67e-11
## CurrentSmokerNot a Current Smoker       -2.278e-01  2.367e-02  -9.625  < 2e-16
## PoorSleepDays                            1.671e-02  9.526e-04  17.537  < 2e-16
## PhysicalActivityPhysically Active       -1.361e-01  2.203e-02  -6.176 6.56e-10
## maritalNot married                       4.765e-01  2.352e-02  20.260  < 2e-16
## PrematureMortality                      -5.217e-04  1.725e-04  -3.024 0.002498
## ParkAccess                               2.245e-03  5.332e-04   4.211 2.54e-05
## CrimeRate                                1.232e-04  4.679e-05   2.633 0.008472
## WaterSafety                             -2.688e-03  7.111e-04  -3.780 0.000157
## SomeCollegeRate                          6.731e-03  1.294e-03   5.203 1.96e-07
## raceOther races:SocialSupport Rarely     1.558e+00  5.980e-02  26.047  < 2e-16
## raceWhite:SocialSupport Rarely           2.121e+00  3.882e-02  54.638  < 2e-16
## raceOther races:SocialSupport Sometimes  7.926e-01  4.467e-02  17.742  < 2e-16
## raceWhite:SocialSupport Sometimes        1.259e+00  3.079e-02  40.904  < 2e-16
## raceOther races:SocialSupportAlways     -7.486e-01  5.481e-02 -13.658  < 2e-16
## raceWhite:SocialSupportAlways           -6.657e-01  3.551e-02 -18.749  < 2e-16
## raceOther races:SocialSupportNever       9.439e-01  5.913e-02  15.962  < 2e-16
## raceWhite:SocialSupportNever             1.317e+00  4.424e-02  29.772  < 2e-16
## raceOther races:SocialSupportUsually    -6.791e-02  5.809e-02  -1.169 0.242414
## raceWhite:SocialSupportUsually                  NA         NA      NA       NA
##                                            
## (Intercept)                             ***
## age                                     ***
## income[25000-35000)                        
## income[35000-50000)                     ** 
## income50000 or more                     ***
## incomeless than 15000                      
## GeneralHealthFair                       ***
## GeneralHealthGood                       ***
## GeneralHealthPoor                       ***
## GeneralHealthVery good                  ***
## PoorPhysicalHealthDays                  *  
## PoorMentalHealthDays                    ***
## HealthInsuranceWith Health Coverage     ***
## LimitedActivityNot Limited              ***
## employRetired                              
## employSelf-Employed                     ***
## employStudent                           *  
## employUnable to work                    ***
## employUnemployed for less than 1 yr     ***
## employUnemployed for more than 1 yr     ***
## employWaged Employement                 ** 
## BMIObese                                   
## BMIOverweight                           ** 
## HeavyDrinkerNot a Heavy Drinker         ***
## CurrentSmokerNot a Current Smoker       ***
## PoorSleepDays                           ***
## PhysicalActivityPhysically Active       ***
## maritalNot married                      ***
## PrematureMortality                      ** 
## ParkAccess                              ***
## CrimeRate                               ** 
## WaterSafety                             ***
## SomeCollegeRate                         ***
## raceOther races:SocialSupport Rarely    ***
## raceWhite:SocialSupport Rarely          ***
## raceOther races:SocialSupport Sometimes ***
## raceWhite:SocialSupport Sometimes       ***
## raceOther races:SocialSupportAlways     ***
## raceWhite:SocialSupportAlways           ***
## raceOther races:SocialSupportNever      ***
## raceWhite:SocialSupportNever            ***
## raceOther races:SocialSupportUsually       
## raceWhite:SocialSupportUsually             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73581  on 275561  degrees of freedom
## AIC: 73665
## 
## Number of Fisher Scoring iterations: 7

Interaction of Income among different races

Model_Int2 <- glm(wellbeing ~ 
                    age + 
                    race:income +
                    SocialSupport +
                    GeneralHealth + 
                    PoorPhysicalHealthDays + 
                    PoorMentalHealthDays + 
                    HealthInsurance + 
                    LimitedActivity + 
                    employ +
                    BMI + 
                    HeavyDrinker + 
                    CurrentSmoker + 
                    PoorSleepDays + 
                    PhysicalActivity + 
                    marital + 
                    PrematureMortality + 
                    ParkAccess + 
                    CrimeRate + 
                    WaterSafety + 
                    SomeCollegeRate, 
                  family = "binomial", df_final)
summary(Model_Int2)
## 
## Call:
## glm(formula = wellbeing ~ age + race:income + SocialSupport + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + PhysicalActivity + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + WaterSafety + 
##     SomeCollegeRate, family = "binomial", data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5374  -0.2319  -0.1414  -0.1015   3.5655  
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -1.600e+00  1.526e-01 -10.488  < 2e-16
## age                                   -4.271e-03  9.084e-04  -4.702 2.57e-06
## SocialSupport Sometimes               -8.353e-01  3.216e-02 -25.973  < 2e-16
## SocialSupportAlways                   -2.673e+00  3.620e-02 -73.850  < 2e-16
## SocialSupportNever                    -7.567e-01  4.078e-02 -18.557  < 2e-16
## SocialSupportUsually                  -2.022e+00  3.454e-02 -58.533  < 2e-16
## GeneralHealthFair                      6.453e-01  4.784e-02  13.488  < 2e-16
## GeneralHealthGood                      3.459e-01  4.395e-02   7.871 3.53e-15
## GeneralHealthPoor                      1.082e+00  5.466e-02  19.803  < 2e-16
## GeneralHealthVery good                 1.911e-01  4.481e-02   4.265 2.00e-05
## PoorPhysicalHealthDays                 2.579e-03  1.195e-03   2.158  0.03096
## PoorMentalHealthDays                   6.143e-02  9.244e-04  66.447  < 2e-16
## HealthInsuranceWith Health Coverage   -2.405e-01  2.890e-02  -8.323  < 2e-16
## LimitedActivityNot Limited            -4.920e-01  2.497e-02 -19.704  < 2e-16
## employRetired                         -2.063e-02  5.537e-02  -0.373  0.70939
## employSelf-Employed                    2.652e-01  6.254e-02   4.240 2.24e-05
## employStudent                          2.231e-01  9.309e-02   2.397  0.01655
## employUnable to work                   4.505e-01  5.468e-02   8.239  < 2e-16
## employUnemployed for less than 1 yr    9.508e-01  6.289e-02  15.119  < 2e-16
## employUnemployed for more than 1 yr    9.452e-01  5.960e-02  15.859  < 2e-16
## employWaged Employement                1.664e-01  5.173e-02   3.216  0.00130
## BMIObese                              -1.797e-02  2.566e-02  -0.700  0.48384
## BMIOverweight                         -7.519e-02  2.566e-02  -2.930  0.00339
## HeavyDrinkerNot a Heavy Drinker       -2.879e-01  4.224e-02  -6.816 9.34e-12
## CurrentSmokerNot a Current Smoker     -2.264e-01  2.369e-02  -9.557  < 2e-16
## PoorSleepDays                          1.673e-02  9.524e-04  17.567  < 2e-16
## PhysicalActivityPhysically Active     -1.343e-01  2.203e-02  -6.095 1.09e-09
## maritalNot married                     4.741e-01  2.351e-02  20.163  < 2e-16
## PrematureMortality                    -5.271e-04  1.726e-04  -3.053  0.00226
## ParkAccess                             2.267e-03  5.332e-04   4.252 2.12e-05
## CrimeRate                              1.275e-04  4.681e-05   2.724  0.00646
## WaterSafety                           -2.661e-03  7.106e-04  -3.745  0.00018
## SomeCollegeRate                        6.769e-03  1.294e-03   5.231 1.68e-07
## raceOther races:income[15000-25000)   -4.195e-01  4.816e-02  -8.710  < 2e-16
## raceWhite:income[15000-25000)         -5.328e-02  3.418e-02  -1.559  0.11896
## raceOther races:income[25000-35000)   -4.965e-01  6.992e-02  -7.101 1.24e-12
## raceWhite:income[25000-35000)         -1.031e-01  4.202e-02  -2.455  0.01411
## raceOther races:income[35000-50000)   -3.507e-01  7.278e-02  -4.818 1.45e-06
## raceWhite:income[35000-50000)         -2.006e-01  4.274e-02  -4.694 2.68e-06
## raceOther races:income50000 or more   -4.924e-01  6.511e-02  -7.562 3.97e-14
## raceWhite:income50000 or more         -4.157e-01  4.090e-02 -10.164  < 2e-16
## raceOther races:incomeless than 15000 -4.644e-01  4.475e-02 -10.376  < 2e-16
## raceWhite:incomeless than 15000               NA         NA      NA       NA
##                                          
## (Intercept)                           ***
## age                                   ***
## SocialSupport Sometimes               ***
## SocialSupportAlways                   ***
## SocialSupportNever                    ***
## SocialSupportUsually                  ***
## GeneralHealthFair                     ***
## GeneralHealthGood                     ***
## GeneralHealthPoor                     ***
## GeneralHealthVery good                ***
## PoorPhysicalHealthDays                *  
## PoorMentalHealthDays                  ***
## HealthInsuranceWith Health Coverage   ***
## LimitedActivityNot Limited            ***
## employRetired                            
## employSelf-Employed                   ***
## employStudent                         *  
## employUnable to work                  ***
## employUnemployed for less than 1 yr   ***
## employUnemployed for more than 1 yr   ***
## employWaged Employement               ** 
## BMIObese                                 
## BMIOverweight                         ** 
## HeavyDrinkerNot a Heavy Drinker       ***
## CurrentSmokerNot a Current Smoker     ***
## PoorSleepDays                         ***
## PhysicalActivityPhysically Active     ***
## maritalNot married                    ***
## PrematureMortality                    ** 
## ParkAccess                            ***
## CrimeRate                             ** 
## WaterSafety                           ***
## SomeCollegeRate                       ***
## raceOther races:income[15000-25000)   ***
## raceWhite:income[15000-25000)            
## raceOther races:income[25000-35000)   ***
## raceWhite:income[25000-35000)         *  
## raceOther races:income[35000-50000)   ***
## raceWhite:income[35000-50000)         ***
## raceOther races:income50000 or more   ***
## raceWhite:income50000 or more         ***
## raceOther races:incomeless than 15000 ***
## raceWhite:incomeless than 15000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73612  on 275561  degrees of freedom
## AIC: 73696
## 
## Number of Fisher Scoring iterations: 7

Interaction of race and general health

Model_Int3 <- glm(wellbeing ~ 
                    age + 
                    race:GeneralHealth +
                    SocialSupport +
                    income + 
                    PoorPhysicalHealthDays + 
                    PoorMentalHealthDays + 
                    HealthInsurance + 
                    LimitedActivity + 
                    employ +
                    BMI + 
                    HeavyDrinker + 
                    CurrentSmoker + 
                    PoorSleepDays + 
                    PhysicalActivity + 
                    marital + 
                    PrematureMortality + 
                    ParkAccess + 
                    CrimeRate + 
                    WaterSafety + 
                    SomeCollegeRate, 
                  family = "binomial", df_final)
summary(Model_Int3)
## 
## Call:
## glm(formula = wellbeing ~ age + race:GeneralHealth + SocialSupport + 
##     income + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + PhysicalActivity + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + WaterSafety + 
##     SomeCollegeRate, family = "binomial", data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5449  -0.2320  -0.1414  -0.1011   3.5341  
## 
## Coefficients: (1 not defined because of singularities)
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -1.521e+00  1.491e-01 -10.205  < 2e-16
## age                                    -3.961e-03  9.095e-04  -4.355 1.33e-05
## SocialSupport Sometimes                -8.344e-01  3.215e-02 -25.950  < 2e-16
## SocialSupportAlways                    -2.670e+00  3.619e-02 -73.765  < 2e-16
## SocialSupportNever                     -7.584e-01  4.076e-02 -18.605  < 2e-16
## SocialSupportUsually                   -2.019e+00  3.455e-02 -58.440  < 2e-16
## income[25000-35000)                    -5.381e-02  3.507e-02  -1.534 0.124943
## income[35000-50000)                    -1.049e-01  3.576e-02  -2.934 0.003346
## income50000 or more                    -3.104e-01  3.394e-02  -9.146  < 2e-16
## incomeless than 15000                   2.220e-02  2.920e-02   0.760 0.447153
## PoorPhysicalHealthDays                  2.368e-03  1.196e-03   1.980 0.047759
## PoorMentalHealthDays                    6.148e-02  9.243e-04  66.509  < 2e-16
## HealthInsuranceWith Health Coverage    -2.408e-01  2.888e-02  -8.338  < 2e-16
## LimitedActivityNot Limited             -4.879e-01  2.499e-02 -19.523  < 2e-16
## employRetired                          -2.205e-02  5.537e-02  -0.398 0.690429
## employSelf-Employed                     2.699e-01  6.254e-02   4.316 1.59e-05
## employStudent                           2.193e-01  9.309e-02   2.356 0.018479
## employUnable to work                    4.548e-01  5.466e-02   8.321  < 2e-16
## employUnemployed for less than 1 yr     9.496e-01  6.288e-02  15.101  < 2e-16
## employUnemployed for more than 1 yr     9.460e-01  5.957e-02  15.879  < 2e-16
## employWaged Employement                 1.706e-01  5.172e-02   3.297 0.000976
## BMIObese                               -2.046e-02  2.565e-02  -0.798 0.425033
## BMIOverweight                          -7.862e-02  2.566e-02  -3.064 0.002187
## HeavyDrinkerNot a Heavy Drinker        -2.875e-01  4.224e-02  -6.806 1.00e-11
## CurrentSmokerNot a Current Smoker      -2.273e-01  2.367e-02  -9.602  < 2e-16
## PoorSleepDays                           1.685e-02  9.520e-04  17.703  < 2e-16
## PhysicalActivityPhysically Active      -1.314e-01  2.203e-02  -5.964 2.46e-09
## maritalNot married                      4.783e-01  2.350e-02  20.353  < 2e-16
## PrematureMortality                     -5.454e-04  1.725e-04  -3.162 0.001567
## ParkAccess                              2.274e-03  5.331e-04   4.265 2.00e-05
## CrimeRate                               1.240e-04  4.678e-05   2.650 0.008041
## WaterSafety                            -2.660e-03  7.104e-04  -3.744 0.000181
## SomeCollegeRate                         6.708e-03  1.293e-03   5.187 2.14e-07
## raceOther races:GeneralHealthExcellent -1.681e-01  8.260e-02  -2.035 0.041857
## raceWhite:GeneralHealthExcellent       -2.538e-01  5.104e-02  -4.972 6.64e-07
## raceOther races:GeneralHealthFair       7.128e-02  5.067e-02   1.407 0.159503
## raceWhite:GeneralHealthFair             5.221e-01  3.860e-02  13.526  < 2e-16
## raceOther races:GeneralHealthGood      -1.332e-01  4.895e-02  -2.721 0.006509
## raceWhite:GeneralHealthGood             1.875e-01  3.348e-02   5.600 2.15e-08
## raceOther races:GeneralHealthPoor       5.089e-01  6.153e-02   8.270  < 2e-16
## raceWhite:GeneralHealthPoor             9.607e-01  4.718e-02  20.361  < 2e-16
## raceOther races:GeneralHealthVery good -1.493e-01  6.158e-02  -2.424 0.015345
## raceWhite:GeneralHealthVery good               NA         NA      NA       NA
##                                           
## (Intercept)                            ***
## age                                    ***
## SocialSupport Sometimes                ***
## SocialSupportAlways                    ***
## SocialSupportNever                     ***
## SocialSupportUsually                   ***
## income[25000-35000)                       
## income[35000-50000)                    ** 
## income50000 or more                    ***
## incomeless than 15000                     
## PoorPhysicalHealthDays                 *  
## PoorMentalHealthDays                   ***
## HealthInsuranceWith Health Coverage    ***
## LimitedActivityNot Limited             ***
## employRetired                             
## employSelf-Employed                    ***
## employStudent                          *  
## employUnable to work                   ***
## employUnemployed for less than 1 yr    ***
## employUnemployed for more than 1 yr    ***
## employWaged Employement                ***
## BMIObese                                  
## BMIOverweight                          ** 
## HeavyDrinkerNot a Heavy Drinker        ***
## CurrentSmokerNot a Current Smoker      ***
## PoorSleepDays                          ***
## PhysicalActivityPhysically Active      ***
## maritalNot married                     ***
## PrematureMortality                     ** 
## ParkAccess                             ***
## CrimeRate                              ** 
## WaterSafety                            ***
## SomeCollegeRate                        ***
## raceOther races:GeneralHealthExcellent *  
## raceWhite:GeneralHealthExcellent       ***
## raceOther races:GeneralHealthFair         
## raceWhite:GeneralHealthFair            ***
## raceOther races:GeneralHealthGood      ** 
## raceWhite:GeneralHealthGood            ***
## raceOther races:GeneralHealthPoor      ***
## raceWhite:GeneralHealthPoor            ***
## raceOther races:GeneralHealthVery good *  
## raceWhite:GeneralHealthVery good          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 114891  on 275602  degrees of freedom
## Residual deviance:  73605  on 275561  degrees of freedom
## AIC: 73689
## 
## Number of Fisher Scoring iterations: 7

Check statistical model performance

library(broom)
M1 <- glance(model.all)
M2 <- glance(model.1)
M3 <- glance(Model_Int1)
M4 <- glance(Model_Int2)
M5 <- glance(Model_Int3)

model_table <- bind_rows(M1, M2, M3, M4, M5)
datatable(model_table,
          rownames=c("All", "Reduced", "race:SocialSupport", "race:income", "race:GeneralHealth"), 
          options=list(pageLength=5,scrollX=T)) %>%
  formatRound(c(1:4:6),digits=2)
## Warning in 1:4:6: numerical expression has 4 elements: only the first used

Logistic Regression Using Machine Learning

Split the Data set

## set the seed to make your partition reproducible
set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))

train <- df_final[train_index, ]
test <- df_final[-train_index, ]
nrow(train)
## [1] 206702
nrow(test)
## [1] 68901

Train the Model

pacman::p_load(caret)
glm_model <- train(wellbeing ~ 
                     age + 
                     SocialSupport + 
                     race + 
                     income +  
                     GeneralHealth + 
                     PoorPhysicalHealthDays +  
                     PoorMentalHealthDays + 
                     HealthInsurance + 
                     LimitedActivity + 
                     employ + 
                     BMI + 
                     HeavyDrinker + 
                     CurrentSmoker + 
                     PoorSleepDays + 
                     PhysicalActivity +
                     marital + 
                     PrematureMortality + 
                     ParkAccess + 
                     CrimeRate + 
                     WaterSafety + 
                     SomeCollegeRate, 
                   data = train, method = "glm", family = "binomial")
attributes(glm_model)
## $names
##  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
##  [6] "bestTune"     "call"         "dots"         "metric"       "control"     
## [11] "finalModel"   "preProcess"   "trainingData" "ptype"        "resample"    
## [16] "resampledCM"  "perfNames"    "maximize"     "yLimits"      "times"       
## [21] "levels"       "terms"        "coefnames"    "contrasts"    "xlevels"     
## 
## $class
## [1] "train"         "train.formula"
summary(glm_model)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5314  -0.2309  -0.1414  -0.1017   3.5426  
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -1.864e+00  1.769e-01 -10.537  < 2e-16
## age                                   -4.681e-03  1.049e-03  -4.464 8.05e-06
## `SocialSupport Sometimes`             -8.223e-01  3.712e-02 -22.149  < 2e-16
## SocialSupportAlways                   -2.687e+00  4.184e-02 -64.203  < 2e-16
## SocialSupportNever                    -7.524e-01  4.716e-02 -15.953  < 2e-16
## SocialSupportUsually                  -2.053e+00  4.002e-02 -51.307  < 2e-16
## raceWhite                              3.656e-01  3.086e-02  11.846  < 2e-16
## `income[25000-35000)`                 -4.069e-02  4.051e-02  -1.004 0.315258
## `income[35000-50000)`                 -1.224e-01  4.163e-02  -2.941 0.003276
## `income50000 or more`                 -2.876e-01  3.914e-02  -7.347 2.03e-13
## `incomeless than 15000`                4.284e-02  3.375e-02   1.269 0.204380
## GeneralHealthFair                      6.713e-01  5.504e-02  12.197  < 2e-16
## GeneralHealthGood                      3.373e-01  5.069e-02   6.654 2.85e-11
## GeneralHealthPoor                      1.093e+00  6.294e-02  17.361  < 2e-16
## `GeneralHealthVery good`               2.197e-01  5.156e-02   4.261 2.03e-05
## PoorPhysicalHealthDays                 3.147e-03  1.380e-03   2.281 0.022572
## PoorMentalHealthDays                   6.134e-02  1.069e-03  57.399  < 2e-16
## `HealthInsuranceWith Health Coverage` -2.508e-01  3.343e-02  -7.502 6.29e-14
## `LimitedActivityNot Limited`          -4.933e-01  2.885e-02 -17.101  < 2e-16
## employRetired                         -1.542e-02  6.419e-02  -0.240 0.810110
## `employSelf-Employed`                  2.687e-01  7.246e-02   3.709 0.000208
## employStudent                          2.325e-01  1.082e-01   2.149 0.031619
## `employUnable to work`                 4.698e-01  6.344e-02   7.405 1.31e-13
## `employUnemployed for less than 1 yr`  9.377e-01  7.304e-02  12.838  < 2e-16
## `employUnemployed for more than 1 yr`  9.333e-01  6.930e-02  13.467  < 2e-16
## `employWaged Employement`              1.661e-01  6.006e-02   2.766 0.005683
## BMIObese                              -6.533e-03  2.960e-02  -0.221 0.825346
## BMIOverweight                         -7.187e-02  2.963e-02  -2.426 0.015287
## `HeavyDrinkerNot a Heavy Drinker`     -2.918e-01  4.869e-02  -5.994 2.04e-09
## `CurrentSmokerNot a Current Smoker`   -2.524e-01  2.729e-02  -9.246  < 2e-16
## PoorSleepDays                          1.612e-02  1.103e-03  14.614  < 2e-16
## `PhysicalActivityPhysically Active`   -1.286e-01  2.546e-02  -5.051 4.40e-07
## `maritalNot married`                   4.750e-01  2.712e-02  17.513  < 2e-16
## PrematureMortality                    -7.030e-04  2.002e-04  -3.511 0.000446
## ParkAccess                             2.382e-03  6.170e-04   3.861 0.000113
## CrimeRate                              1.438e-04  5.419e-05   2.655 0.007941
## WaterSafety                           -3.075e-03  8.254e-04  -3.726 0.000195
## SomeCollegeRate                        5.702e-03  1.497e-03   3.809 0.000140
##                                          
## (Intercept)                           ***
## age                                   ***
## `SocialSupport Sometimes`             ***
## SocialSupportAlways                   ***
## SocialSupportNever                    ***
## SocialSupportUsually                  ***
## raceWhite                             ***
## `income[25000-35000)`                    
## `income[35000-50000)`                 ** 
## `income50000 or more`                 ***
## `incomeless than 15000`                  
## GeneralHealthFair                     ***
## GeneralHealthGood                     ***
## GeneralHealthPoor                     ***
## `GeneralHealthVery good`              ***
## PoorPhysicalHealthDays                *  
## PoorMentalHealthDays                  ***
## `HealthInsuranceWith Health Coverage` ***
## `LimitedActivityNot Limited`          ***
## employRetired                            
## `employSelf-Employed`                 ***
## employStudent                         *  
## `employUnable to work`                ***
## `employUnemployed for less than 1 yr` ***
## `employUnemployed for more than 1 yr` ***
## `employWaged Employement`             ** 
## BMIObese                                 
## BMIOverweight                         *  
## `HeavyDrinkerNot a Heavy Drinker`     ***
## `CurrentSmokerNot a Current Smoker`   ***
## PoorSleepDays                         ***
## `PhysicalActivityPhysically Active`   ***
## `maritalNot married`                  ***
## PrematureMortality                    ***
## ParkAccess                            ***
## CrimeRate                             ** 
## WaterSafety                           ***
## SomeCollegeRate                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 86611  on 206701  degrees of freedom
## Residual deviance: 55231  on 206664  degrees of freedom
## AIC: 55307
## 
## Number of Fisher Scoring iterations: 7

Evaluate the Model Performance

glm_model$metric
## [1] "Accuracy"
glm_model$results
##   parameter  Accuracy     Kappa   AccuracySD     KappaSD
## 1      none 0.9517427 0.3641728 0.0006149987 0.007055452

Check the Predictions on the Test Data Set

test$pred <- glm_model %>% predict(test)

evaluate the performance-define accuracy function

accuracy_function = function(real, predicted) {
  mean(real == predicted)
}
accuracy_function(real = test$wellbeing,
         predicted = test$pred)
## [1] 0.9527293

Variable Importance of the Variables Used

ggplot(varImp(glm_model))

ggplot(varImp(glm_model)) +
  geom_bar(stat="identity", fill="#67d1d3", colour="black") + 
  ggtitle("Impacts of Features on Wellbeing") +
  labs(y= "Relative Impacts of Features", x= "Features") +
  theme_bw() +
  theme(legend.position="none",
        axis.text.y = element_text( size=12, face="bold")) +
  expand_limits(y=0.85)

## Performance Metrics for Log Reg ML Model

confusionMtx_logreg = table(test$pred,test$wellbeing)
confusionMtx_logreg
##                 
##                  Good_Wellbeing Poor_Wellbeing
##   Good_Wellbeing          64619           2578
##   Poor_Wellbeing            679           1025

Create the Confusion Matrix for Log Reg

TP_logreg=confusionMtx_logreg[1,1]
FP_logreg=confusionMtx_logreg[1,2]
TN_logreg=confusionMtx_logreg[2,2]
FN_logreg=confusionMtx_logreg[2,1]  
TP_logreg
## [1] 64619

Calcaulating Recall

recall_logreg=TP_logreg/(TP_logreg+FN_logreg)
recall_logreg
## [1] 0.9896015

Calcualting Accuracy

Accuracy_logreg=(TP_logreg+TN_logreg)/nrow(test)
Accuracy_logreg
## [1] 0.9527293

Calacualting Specificity

specificity_logreg=TN_logreg/(TN_logreg+FP_logreg)
specificity_logreg
## [1] 0.2844852

Calculate Precision

precision_logreg=TP_logreg/(TP_logreg+FP_logreg)
precision_logreg
## [1] 0.9616352

Calculate F1-score

F1_logreg=(2*recall_logreg*precision_logreg)/(recall_logreg+precision_logreg)
F1_logreg
## [1] 0.9754179

ROC for Log Reg ML Model

# retrieve probabilities
prob_logreg<- glm_model %>% predict(test,type='prob')
str(prob_logreg)
## 'data.frame':    68901 obs. of  2 variables:
##  $ Good_Wellbeing: num  0.976 0.97 0.997 0.994 0.984 ...
##  $ Poor_Wellbeing: num  0.02428 0.03014 0.00299 0.00625 0.01555 ...
logreg.roc=roc(predictor=prob_logreg$Good_Wellbeing,response=test$wellbeing,
             levels=levels(test$wellbeing))
## Setting direction: controls > cases
logreg.roc
## 
## Call:
## roc.default(response = test$wellbeing, predictor = prob_logreg$Good_Wellbeing,     levels = levels(test$wellbeing))
## 
## Data: prob_logreg$Good_Wellbeing in 65298 controls (test$wellbeing Good_Wellbeing) > 3603 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.9066
ggroc(logreg.roc, alpha = 1, colour = "red", linetype = 1, size =0.7 ) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="black", linetype="dashed") + 
  ggtitle("ROC Curve for Logistic Regression ML Model") +
  xlab("1 - specificity") +
  ylab("sensitivity") +
  annotate(geom="text", x=0.7, y=0.75, label="Log Reg Model classifier", color="red") +  
  annotate(geom="text", x=0.45, y=0.4, label="random classifier") +
  annotate(geom="text", x=0.1, y=0.05, label="AUC = 0.9066", color="darkred") +
  theme_bw()

CART MODEL to predict Wellbeing

Library Loading

library(rattle)
library(pROC)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
## 
##     xgboost
## The following object is masked from 'package:dplyr':
## 
##     slice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:psych':
## 
##     outlier

Split the Data

# using same partitioned dataset as before    

# set.seed(123)
# train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))
# 
# train <- df_final[train_index, ]
# test <- df_final[-train_index, ]
# cat(nrow(test),nrow(train))

Holdout

levels(train$wellbeing) <- c("Good_Wellbeing", "Poor_Wellbeing")
trcntrl_ho<-trainControl(classProbs = TRUE,summaryFunction = twoClassSummary)

CART with Auto Hyperparameter Tuning

model_CART <- train(wellbeing ~ 
                      age + 
                      SocialSupport + 
                      race + 
                      income +  
                      GeneralHealth + 
                      PoorPhysicalHealthDays +  
                      PoorMentalHealthDays + 
                      HealthInsurance + 
                      LimitedActivity + 
                      employ + 
                      BMI + 
                      HeavyDrinker + 
                      CurrentSmoker + 
                      PoorSleepDays + 
                      PhysicalActivity +
                      marital + 
                      PrematureMortality + 
                      ParkAccess + 
                      CrimeRate + 
                      WaterSafety + 
                      SomeCollegeRate ,
                    data = train, method = "rpart",trControl=trcntrl_ho,metric="ROC",tuneLength = 10)
model_CART$results
##              cp       ROC      Sens      Spec       ROCSD       SensSD
## 1  0.0006746424 0.7446683 0.9915859 0.2329208 0.016625048 0.0007384287
## 2  0.0007196186 0.7448023 0.9917384 0.2318861 0.016638739 0.0007102355
## 3  0.0007645948 0.7415792 0.9917300 0.2328093 0.004182825 0.0007047197
## 4  0.0009594915 0.7417753 0.9919693 0.2319710 0.004188718 0.0005169431
## 5  0.0009894756 0.7417870 0.9919665 0.2323544 0.004186428 0.0005654889
## 6  0.0013492849 0.7418750 0.9919461 0.2347920 0.004203448 0.0007907361
## 7  0.0029234506 0.7418689 0.9920702 0.2304524 0.004170225 0.0010178245
## 8  0.0031483314 0.7418494 0.9921898 0.2281074 0.004159639 0.0010638153
## 9  0.0140325627 0.7315662 0.9930982 0.1917173 0.048424555 0.0023999118
## 10 0.0174057749 0.6244066 0.9956310 0.1109419 0.122020455 0.0043656988
##        SpecSD
## 1  0.01152684
## 2  0.01079265
## 3  0.01170444
## 4  0.01119856
## 5  0.01160028
## 6  0.01430872
## 7  0.01787513
## 8  0.01888900
## 9  0.04803669
## 10 0.10942878
plot(model_CART)

Variable Importance of the CART Model

ggplot(varImp(model_CART))

ggplot(varImp(model_CART)) +
  geom_bar(stat="identity", fill="#FBE1B3", colour="black") + 
  ggtitle("Impacts of Features on Wellbeing") +
  labs(y= "Relative Impacts of Features", x= "Features") +
  theme_bw() +
  theme(legend.position="none",
        axis.text.y = element_text( size=12, face="bold")) +
  expand_limits(y=0.85)

print(varImp(model_CART))
## rpart variable importance
## 
##   only 20 most important variables shown (out of 46)
## 
##                                      Overall
## PoorMentalHealthDays                100.0000
## LimitedActivityNot Limited           67.1050
## GeneralHealthPoor                    64.3569
## employUnable to work                 61.1772
## PoorPhysicalHealthDays               44.3915
## SocialSupportUsually                 22.5720
## SocialSupportAlways                  18.8065
## maritalNot married                   12.5655
## SocialSupport Sometimes               8.0914
## age                                   3.8893
## CurrentSmokerNot a Current Smoker     2.1379
## employRetired                         2.0181
## raceWhite                             1.8253
## PrematureMortality                    1.2543
## ParkAccess                            0.9275
## SomeCollegeRate                       0.8861
## PoorSleepDays                         0.8672
## employUnemployed for less than 1 yr   0.8315
## employUnemployed for more than 1 yr   0.8118
## GeneralHealthGood                     0.7240

Plot the Tree

plot(model_CART$finalModel, uniform=TRUE,
     main="Classification Tree")
text(model_CART$finalModel, all=FALSE, cex=.7)

# Using the Rattle Library
library(rattle)
fancyRpartPlot(model_CART$finalModel,cex = 0.9)

Prediction on Test Data

test$pred_cart <- model_CART %>% predict(test)
prob_cart<- model_CART %>% predict(test,type='prob')
confusionMtx=table(test$pred_cart,test$wellbeing)
confusionMtx
##                 
##                  Good_Wellbeing Poor_Wellbeing
##   Good_Wellbeing          64787           2798
##   Poor_Wellbeing            511            805

Create the Confusion Matrix

TP=confusionMtx[1,1]
FP=confusionMtx[1,2]
TN=confusionMtx[2,2]
FN=confusionMtx[2,1]  
TP
## [1] 64787

Calcaulating Recall

recall=TP/(TP+FN)
recall
## [1] 0.9921743

Calcualting Accuracy

Accuracy=(TP+TN)/nrow(test)
Accuracy
## [1] 0.9519746

Calacualting Specificity

specificity=TN/(TN+FP)
specificity
## [1] 0.2234249

Calculate Precision

precision=TP/(TP+FP)
precision
## [1] 0.9586003

Calculate F1-score

F1=(2*recall*precision)/(recall+precision)
F1
## [1] 0.9750984
str(prob_cart)
## 'data.frame':    68901 obs. of  2 variables:
##  $ Good_Wellbeing: num  0.972 0.972 0.972 0.972 0.972 ...
##  $ Poor_Wellbeing: num  0.0276 0.0276 0.0276 0.0276 0.0276 ...

ROC

CART.roc=roc(predictor=prob_cart$Good_Wellbeing,response=test$wellbeing,
             levels=levels(test$wellbeing))
## Setting direction: controls > cases
CART.roc
## 
## Call:
## roc.default(response = test$wellbeing, predictor = prob_cart$Good_Wellbeing,     levels = levels(test$wellbeing))
## 
## Data: prob_cart$Good_Wellbeing in 65298 controls (test$wellbeing Good_Wellbeing) > 3603 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.7389
ggroc(CART.roc, alpha = 1, colour = "red", linetype = 1, size =0.7 ) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="black", linetype="dashed") + 
  ggtitle("ROC Curve for CART Model") +
  xlab("1 - specificity") +
  ylab("sensitivity") +
  annotate(geom="text", x=0.7, y=0.75, label="CART Model classifier", color="red") +  
  annotate(geom="text", x=0.45, y=0.4, label="random classifier") +
  annotate(geom="text", x=0.1, y=0.05, label="AUC = 0.7388", color="darkred") +
  theme_bw()